github.com/MerlinKodo/gvisor@v0.0.0-20231110090155-957f62ecf90e/pkg/sentry/pgalloc/pgalloc.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package pgalloc contains the page allocator subsystem, which manages memory
    16  // that may be mapped into application address spaces.
    17  //
    18  // Lock order:
    19  //
    20  //	 pgalloc.MemoryFile.mu
    21  //		pgalloc.MemoryFile.mappingsMu
    22  package pgalloc
    23  
    24  import (
    25  	"fmt"
    26  	"math"
    27  	"os"
    28  	"sync/atomic"
    29  	"time"
    30  
    31  	"github.com/MerlinKodo/gvisor/pkg/abi/linux"
    32  	"github.com/MerlinKodo/gvisor/pkg/atomicbitops"
    33  	"github.com/MerlinKodo/gvisor/pkg/context"
    34  	"github.com/MerlinKodo/gvisor/pkg/errors/linuxerr"
    35  	"github.com/MerlinKodo/gvisor/pkg/hostarch"
    36  	"github.com/MerlinKodo/gvisor/pkg/log"
    37  	"github.com/MerlinKodo/gvisor/pkg/safemem"
    38  	"github.com/MerlinKodo/gvisor/pkg/sentry/hostmm"
    39  	"github.com/MerlinKodo/gvisor/pkg/sentry/memmap"
    40  	"github.com/MerlinKodo/gvisor/pkg/sentry/usage"
    41  	"github.com/MerlinKodo/gvisor/pkg/sync"
    42  	"golang.org/x/sys/unix"
    43  )
    44  
    45  // Direction describes how to allocate offsets from MemoryFile.
    46  type Direction int
    47  
    48  const (
    49  	// BottomUp allocates offsets in increasing offsets.
    50  	BottomUp Direction = iota
    51  	// TopDown allocates offsets in decreasing offsets.
    52  	TopDown
    53  )
    54  
    55  // String implements fmt.Stringer.
    56  func (d Direction) String() string {
    57  	switch d {
    58  	case BottomUp:
    59  		return "up"
    60  	case TopDown:
    61  		return "down"
    62  	}
    63  	panic(fmt.Sprintf("invalid direction: %d", d))
    64  }
    65  
// MemoryFile is a memmap.File whose pages may be allocated to arbitrary
// users.
type MemoryFile struct {
	// opts holds options passed to NewMemoryFile. opts is immutable.
	opts MemoryFileOpts

	// MemoryFile owns a single backing file, which is modeled as follows:
	//
	// Each page in the file can be committed or uncommitted. A page is
	// committed if the host kernel is spending resources to store its contents
	// and uncommitted otherwise. This definition includes pages that the host
	// kernel has swapped; this is intentional, to ensure that accounting does
	// not change even if host kernel swapping behavior changes, and that
	// memory used by pseudo-swap mechanisms like zswap is still accounted.
	//
	// The initial contents of uncommitted pages are implicitly zero bytes. A
	// read or write to the contents of an uncommitted page causes it to be
	// committed. This is the only event that can cause an uncommitted page to
	// be committed.
	//
	// fallocate(FALLOC_FL_PUNCH_HOLE) (MemoryFile.Decommit) causes committed
	// pages to be uncommitted. This is the only event that can cause a
	// committed page to be uncommitted.
	//
	// Memory accounting is based on identifying the set of committed pages.
	// Since we do not have direct access to the MMU, tracking reads and writes
	// to uncommitted pages to detect commitment would introduce additional
	// page faults, which would be prohibitively expensive. Instead, we query
	// the host kernel to determine which pages are committed.

	// file is the backing file. The file pointer is immutable.
	file *os.File

	// mu protects the fields below that are documented as "protected by mu".
	// Per the package lock order, mu is acquired before mappingsMu.
	mu memoryFileMutex

	// usage maps each page in the file to metadata for that page. Pages for
	// which no segment exists in usage are both unallocated (not in use) and
	// uncommitted.
	//
	// Since usage stores usageInfo objects by value, clients should usually
	// use usageIterator.ValuePtr() instead of usageIterator.Value() to get a
	// pointer to the usageInfo rather than a copy.
	//
	// usage must be kept maximally merged (that is, there should never be two
	// adjacent segments with the same values). At least markReclaimed depends
	// on this property.
	//
	// usage is protected by mu.
	usage usageSet

	// The UpdateUsage function scans all segments with knownCommitted set
	// to false, sees which pages are committed and creates corresponding
	// segments with knownCommitted set to true.
	//
	// In order to avoid unnecessary scans, usageExpected tracks the total
	// file blocks expected. This is used to elide the scan when this
	// matches the underlying file blocks.
	//
	// To track swapped pages, usageSwapped tracks the discrepancy between
	// what is observed in core and what is reported by the file. When
	// usageSwapped is non-zero, a sweep will be performed at least every
	// second. The start of the last sweep is recorded in usageLast.
	//
	// All usage attributes are protected by mu.
	usageExpected uint64
	usageSwapped  uint64
	usageLast     time.Time

	// fileSize is the size of the backing memory file in bytes. fileSize is
	// always a power-of-two multiple of chunkSize.
	//
	// fileSize is protected by mu.
	fileSize int64

	// Pages from the backing file are mapped into the local address space on
	// the granularity of large pieces called chunks. mappings is a []uintptr
	// that stores, for each chunk, the start address of a mapping of that
	// chunk in the current process' address space, or 0 if no such mapping
	// exists. Once a chunk is mapped, it is never remapped or unmapped until
	// the MemoryFile is destroyed.
	//
	// Mutating the mappings slice or its contents requires both holding
	// mappingsMu and using atomic memory operations. (The slice is mutated
	// whenever the file is expanded. Per the above, the only permitted
	// mutation of the slice's contents is the assignment of a mapping to a
	// chunk that was previously unmapped.) Reading the slice or its contents
	// only requires *either* holding mappingsMu or using atomic memory
	// operations. This allows MemoryFile.MapInternal to avoid locking in the
	// common case where chunk mappings already exist.
	mappingsMu mappingsMutex
	mappings   atomic.Value

	// destroyed is set by Destroy to instruct the reclaimer goroutine to
	// release resources and exit. destroyed is protected by mu.
	destroyed bool

	// reclaimable is true if usage may contain reclaimable pages. reclaimable
	// is protected by mu.
	reclaimable bool

	// reclaim is the collection of regions for reclaim. reclaim is protected
	// by mu.
	reclaim reclaimSet

	// reclaimCond is signaled (with mu locked) when reclaimable or destroyed
	// transitions from false to true. NewMemoryFile sets reclaimCond.L to
	// &mu.
	reclaimCond sync.Cond

	// evictable maps EvictableMemoryUsers to eviction state.
	//
	// evictable is protected by mu.
	evictable map[EvictableMemoryUser]*evictableMemoryUserInfo

	// evictionWG counts the number of goroutines currently performing evictions.
	evictionWG sync.WaitGroup

	// stopNotifyPressure stops memory cgroup pressure level
	// notifications used to drive eviction. stopNotifyPressure is
	// immutable. NewMemoryFile only sets it when host memcg pressure
	// notifications are enabled; otherwise it is left nil.
	stopNotifyPressure func()
}
   187  
// MemoryFileOpts provides options to NewMemoryFile.
type MemoryFileOpts struct {
	// DelayedEviction controls the extent to which the MemoryFile may delay
	// eviction of evictable allocations. NewMemoryFile replaces
	// DelayedEvictionDefault with DelayedEvictionEnabled and rejects values
	// that are not one of the declared DelayedEvictionType constants.
	DelayedEviction DelayedEvictionType

	// If UseHostMemcgPressure is true, use host memory cgroup pressure level
	// notifications to determine when eviction is necessary. This option has
	// no effect unless DelayedEviction is DelayedEvictionEnabled;
	// NewMemoryFile forces it to false for DelayedEvictionDisabled and
	// DelayedEvictionManual.
	UseHostMemcgPressure bool

	// DecommitOnDestroy indicates whether the entire host file should be
	// decommitted on destruction. This is appropriate for host filesystem based
	// files that need to be explicitly cleaned up to release disk space.
	DecommitOnDestroy bool

	// If ManualZeroing is true, MemoryFile must not assume that new pages
	// obtained from the host are zero-filled, such that MemoryFile must manually
	// zero newly-allocated pages.
	ManualZeroing bool

	// If DisableIMAWorkAround is true, NewMemoryFile will not call
	// IMAWorkAroundForMemFile().
	DisableIMAWorkAround bool

	// DiskBackedFile indicates that the MemoryFile is backed by a file on disk.
	DiskBackedFile bool
}
   216  
// DelayedEvictionType is the type of MemoryFileOpts.DelayedEviction.
type DelayedEvictionType int

const (
	// DelayedEvictionDefault has unspecified behavior. As of this writing,
	// NewMemoryFile treats it as DelayedEvictionEnabled.
	DelayedEvictionDefault DelayedEvictionType = iota

	// DelayedEvictionDisabled requires that evictable allocations are evicted
	// as soon as possible.
	DelayedEvictionDisabled

	// DelayedEvictionEnabled requests that the MemoryFile delay eviction of
	// evictable allocations until doing so is considered necessary to avoid
	// performance degradation due to host memory pressure, or OOM kills.
	//
	// As of this writing, the behavior of DelayedEvictionEnabled depends on
	// whether or not MemoryFileOpts.UseHostMemcgPressure is enabled:
	//
	//	- If UseHostMemcgPressure is true, evictions are delayed until memory
	//		pressure is indicated.
	//
	//	- Otherwise, evictions are only delayed until the reclaimer goroutine
	//		is out of work (pages to reclaim).
	DelayedEvictionEnabled

	// DelayedEvictionManual requires that evictable allocations are only
	// evicted when MemoryFile.StartEvictions() is called. This is extremely
	// dangerous outside of tests.
	DelayedEvictionManual
)
   247  
// usageInfo tracks usage information for a range of pages in a MemoryFile.
//
// +stateify savable
type usageInfo struct {
	// kind is the usage kind.
	kind usage.MemoryKind

	// knownCommitted is true if the tracked region is definitely committed.
	// (If it is false, the tracked region may or may not be committed.)
	knownCommitted bool

	// refs is the reference count on the tracked region. Newly allocated
	// regions start with refs == 1; canCommit treats regions with refs == 0
	// as not committable.
	refs uint64

	// memCgID is the memory cgroup id to which this page is committed.
	memCgID uint32
}
   264  
   265  // canCommit returns true if the tracked region can be committed.
   266  func (u *usageInfo) canCommit() bool {
   267  	// refs must be greater than 0 because we assume that reclaimable pages
   268  	// (that aren't already known to be committed) are not committed. This
   269  	// isn't necessarily true, even after the reclaimer does Decommit(),
   270  	// because the kernel may subsequently back the hugepage-sized region
   271  	// containing the decommitted page with a hugepage. However, it's
   272  	// consistent with our treatment of unallocated pages, which have the same
   273  	// property.
   274  	return !u.knownCommitted && u.refs != 0
   275  }
   276  
// An EvictableMemoryUser represents a user of MemoryFile-allocated memory that
// may be asked to deallocate that memory in the presence of memory pressure.
//
// EvictableMemoryUser values are used as keys of MemoryFile.evictable.
type EvictableMemoryUser interface {
	// Evict requests that the EvictableMemoryUser deallocate memory used by
	// er, which was registered as evictable by a previous call to
	// MemoryFile.MarkEvictable.
	//
	// Evict is not required to deallocate memory. In particular, since pgalloc
	// must call Evict without holding locks to avoid circular lock ordering,
	// it is possible that the passed range has already been marked as
	// unevictable by a racing call to MemoryFile.MarkUnevictable.
	// Implementations of EvictableMemoryUser must detect such races and handle
	// them by making Evict have no effect on unevictable ranges.
	//
	// After a call to Evict, the MemoryFile will consider the evicted range
	// unevictable (i.e. it will not call Evict on the same range again) until
	// informed otherwise by a subsequent call to MarkEvictable.
	Evict(ctx context.Context, er EvictableRange)
}
   296  
   297  // An EvictableRange represents a range of uint64 offsets in an
   298  // EvictableMemoryUser.
   299  //
   300  // In practice, most EvictableMemoryUsers will probably be implementations of
   301  // memmap.Mappable, and EvictableRange therefore corresponds to
   302  // memmap.MappableRange. However, this package cannot depend on the memmap
   303  // package, since doing so would create a circular dependency.
   304  //
   305  // type EvictableRange <generated using go_generics>
   306  
// evictableMemoryUserInfo is the value type of MemoryFile.evictable. It holds
// the eviction state for a single EvictableMemoryUser.
type evictableMemoryUserInfo struct {
	// ranges tracks all evictable ranges for the given user.
	ranges evictableRangeSet

	// If evicting is true, there is a goroutine currently evicting all
	// evictable ranges for this user.
	evicting bool
}
   316  
const (
	// chunkShift is log2 of chunkSize. The backing file is grown and mapped
	// in units of chunkSize bytes.
	chunkShift = 30
	chunkSize  = 1 << chunkShift // 1 GB
	// chunkMask selects the offset-within-chunk bits of a file offset.
	chunkMask = chunkSize - 1

	// maxPage is the highest 64-bit page.
	maxPage = math.MaxUint64 &^ (hostarch.PageSize - 1)
)
   325  
   326  // NewMemoryFile creates a MemoryFile backed by the given file. If
   327  // NewMemoryFile succeeds, ownership of file is transferred to the returned
   328  // MemoryFile.
   329  func NewMemoryFile(file *os.File, opts MemoryFileOpts) (*MemoryFile, error) {
   330  	switch opts.DelayedEviction {
   331  	case DelayedEvictionDefault:
   332  		opts.DelayedEviction = DelayedEvictionEnabled
   333  	case DelayedEvictionDisabled, DelayedEvictionManual:
   334  		opts.UseHostMemcgPressure = false
   335  	case DelayedEvictionEnabled:
   336  		// ok
   337  	default:
   338  		return nil, fmt.Errorf("invalid MemoryFileOpts.DelayedEviction: %v", opts.DelayedEviction)
   339  	}
   340  
   341  	// Truncate the file to 0 bytes first to ensure that it's empty.
   342  	if err := file.Truncate(0); err != nil {
   343  		return nil, err
   344  	}
   345  	f := &MemoryFile{
   346  		opts:      opts,
   347  		file:      file,
   348  		evictable: make(map[EvictableMemoryUser]*evictableMemoryUserInfo),
   349  	}
   350  	f.mappings.Store(make([]uintptr, 0))
   351  	f.reclaimCond.L = &f.mu
   352  
   353  	if f.opts.DelayedEviction == DelayedEvictionEnabled && f.opts.UseHostMemcgPressure {
   354  		stop, err := hostmm.NotifyCurrentMemcgPressureCallback(func() {
   355  			f.mu.Lock()
   356  			startedAny := f.startEvictionsLocked()
   357  			f.mu.Unlock()
   358  			if startedAny {
   359  				log.Debugf("pgalloc.MemoryFile performing evictions due to memcg pressure")
   360  			}
   361  		}, "low")
   362  		if err != nil {
   363  			return nil, fmt.Errorf("failed to configure memcg pressure level notifications: %v", err)
   364  		}
   365  		f.stopNotifyPressure = stop
   366  	}
   367  
   368  	go f.runReclaim() // S/R-SAFE: f.mu
   369  
   370  	if !opts.DisableIMAWorkAround {
   371  		IMAWorkAroundForMemFile(file.Fd())
   372  	}
   373  	return f, nil
   374  }
   375  
   376  // IMAWorkAroundForMemFile works around IMA by immediately creating a temporary
   377  // PROT_EXEC mapping, while the backing file is still small. IMA will ignore
   378  // any future mappings.
   379  //
   380  // The Linux kernel contains an optional feature called "Integrity
   381  // Measurement Architecture" (IMA). If IMA is enabled, it will checksum
   382  // binaries the first time they are mapped PROT_EXEC. This is bad news for
   383  // executable pages mapped from our backing file, which can grow to
   384  // terabytes in (sparse) size. If IMA attempts to checksum a file that
   385  // large, it will allocate all of the sparse pages and quickly exhaust all
   386  // memory.
   387  func IMAWorkAroundForMemFile(fd uintptr) {
   388  	m, _, errno := unix.Syscall6(
   389  		unix.SYS_MMAP,
   390  		0,
   391  		hostarch.PageSize,
   392  		unix.PROT_EXEC,
   393  		unix.MAP_SHARED,
   394  		fd,
   395  		0)
   396  	if errno != 0 {
   397  		// This isn't fatal (IMA may not even be in use). Log the error, but
   398  		// don't return it.
   399  		log.Warningf("Failed to pre-map MemoryFile PROT_EXEC: %v", errno)
   400  	} else {
   401  		if _, _, errno := unix.Syscall(
   402  			unix.SYS_MUNMAP,
   403  			m,
   404  			hostarch.PageSize,
   405  			0); errno != 0 {
   406  			panic(fmt.Sprintf("failed to unmap PROT_EXEC MemoryFile mapping: %v", errno))
   407  		}
   408  	}
   409  }
   410  
   411  // Destroy releases all resources used by f.
   412  //
   413  // Preconditions: All pages allocated by f have been freed.
   414  //
   415  // Postconditions: None of f's methods may be called after Destroy.
   416  func (f *MemoryFile) Destroy() {
   417  	f.mu.Lock()
   418  	defer f.mu.Unlock()
   419  	f.destroyed = true
   420  	f.reclaimCond.Signal()
   421  }
   422  
// AllocationMode provides a way to inform the pgalloc API how to allocate
// memory and pages on the host.
//
// A page will exist in one of the following incremental states:
//  1. Allocated: A page is allocated if it was returned by Allocate() and its
//     reference count hasn't dropped to 0 since then.
//  2. Committed: As described in MemoryFile documentation above, a page is
//     committed if the host kernel is spending resources to store its
//     contents. A committed page is implicitly allocated.
//  3. Populated: A page is populated for reading/writing in a page table
//     hierarchy if it has a page table entry that permits reading/writing
//     respectively. A populated page is implicitly committed, since the page
//     table entry needs a physical page to point to, but not vice versa.
//
// MemoryFile.Allocate panics if given a mode other than the constants
// declared below.
type AllocationMode int

const (
	// AllocateOnly indicates that pages need to only be allocated.
	AllocateOnly AllocationMode = iota
	// AllocateAndCommit indicates that pages need to be committed, in addition
	// to being allocated.
	AllocateAndCommit
	// AllocateAndWritePopulate indicates that writable pages should ideally be
	// populated in the page table, in addition to being allocated. This is a
	// suggestion, not a requirement.
	AllocateAndWritePopulate
)
   448  
// AllocOpts are options used in MemoryFile.Allocate.
type AllocOpts struct {
	// Kind is the memory kind to be used for accounting.
	Kind usage.MemoryKind
	// Dir indicates the direction in which offsets are allocated.
	Dir Direction
	// MemCgID is the memory cgroup ID and the zero value indicates that
	// the memory will not be accounted to any cgroup.
	MemCgID uint32
	// Mode allows the callers to select how the pages are allocated in the
	// MemoryFile. Callers that will fill the allocated memory by writing to it
	// should pass AllocateAndWritePopulate to avoid faulting page-by-page. Callers
	// that will fill the allocated memory by invoking host system calls should
	// pass AllocateOnly.
	Mode AllocationMode
	// If Reader is provided, the allocated memory is filled by calling
	// ReadToBlocks() repeatedly until either length bytes are read or a non-nil
	// error is returned. Allocate then returns the allocated memory, truncated
	// down to the nearest page. If this is shorter than length bytes due to an
	// error returned by ReadToBlocks(), Allocate returns the partially filled
	// range together with that error.
	Reader safemem.Reader
}
   471  
   472  // Allocate returns a range of initially-zeroed pages of the given length with
   473  // the given accounting kind and a single reference held by the caller. When
   474  // the last reference on an allocated page is released, ownership of the page
   475  // is returned to the MemoryFile, allowing it to be returned by a future call
   476  // to Allocate.
   477  //
   478  // Preconditions: length must be page-aligned and non-zero.
   479  func (f *MemoryFile) Allocate(length uint64, opts AllocOpts) (memmap.FileRange, error) {
   480  	fr, err := f.allocate(length, &opts)
   481  	if err != nil {
   482  		return memmap.FileRange{}, err
   483  	}
   484  	var dsts safemem.BlockSeq
   485  	switch opts.Mode {
   486  	case AllocateOnly: // Allocation is handled above. Nothing more to do.
   487  	case AllocateAndCommit:
   488  		if err := f.commitFile(fr); err != nil {
   489  			f.DecRef(fr)
   490  			return memmap.FileRange{}, err
   491  		}
   492  	case AllocateAndWritePopulate:
   493  		dsts, err = f.MapInternal(fr, hostarch.Write)
   494  		if err != nil {
   495  			f.DecRef(fr)
   496  			return memmap.FileRange{}, err
   497  		}
   498  		if canPopulate() {
   499  			rem := dsts
   500  			for {
   501  				if !tryPopulate(rem.Head()) {
   502  					break
   503  				}
   504  				rem = rem.Tail()
   505  				if rem.IsEmpty() {
   506  					break
   507  				}
   508  			}
   509  		}
   510  	default:
   511  		panic(fmt.Sprintf("unknown allocation mode: %d", opts.Mode))
   512  	}
   513  	if opts.Reader != nil {
   514  		if dsts.IsEmpty() {
   515  			dsts, err = f.MapInternal(fr, hostarch.Write)
   516  			if err != nil {
   517  				f.DecRef(fr)
   518  				return memmap.FileRange{}, err
   519  			}
   520  		}
   521  		n, err := safemem.ReadFullToBlocks(opts.Reader, dsts)
   522  		un := uint64(hostarch.Addr(n).RoundDown())
   523  		if un < length {
   524  			// Free unused memory and update fr to contain only the memory that is
   525  			// still allocated.
   526  			f.DecRef(memmap.FileRange{fr.Start + un, fr.End})
   527  			fr.End = fr.Start + un
   528  		}
   529  		if err != nil {
   530  			return fr, err
   531  		}
   532  	}
   533  	return fr, nil
   534  }
   535  
   536  func (f *MemoryFile) allocate(length uint64, opts *AllocOpts) (memmap.FileRange, error) {
   537  	if length == 0 || length%hostarch.PageSize != 0 {
   538  		panic(fmt.Sprintf("invalid allocation length: %#x", length))
   539  	}
   540  
   541  	f.mu.Lock()
   542  	defer f.mu.Unlock()
   543  
   544  	// Align hugepage-and-larger allocations on hugepage boundaries to try
   545  	// to take advantage of hugetmpfs.
   546  	alignment := uint64(hostarch.PageSize)
   547  	if length >= hostarch.HugePageSize {
   548  		alignment = hostarch.HugePageSize
   549  	}
   550  
   551  	// Find a range in the underlying file.
   552  	fr, ok := f.findAvailableRange(length, alignment, opts.Dir)
   553  	if !ok {
   554  		return memmap.FileRange{}, linuxerr.ENOMEM
   555  	}
   556  
   557  	// Expand the file if needed.
   558  	if int64(fr.End) > f.fileSize {
   559  		// Round the new file size up to be chunk-aligned.
   560  		newFileSize := (int64(fr.End) + chunkMask) &^ chunkMask
   561  		if err := f.file.Truncate(newFileSize); err != nil {
   562  			return memmap.FileRange{}, err
   563  		}
   564  		f.fileSize = newFileSize
   565  		f.mappingsMu.Lock()
   566  		oldMappings := f.mappings.Load().([]uintptr)
   567  		newMappings := make([]uintptr, newFileSize>>chunkShift)
   568  		copy(newMappings, oldMappings)
   569  		f.mappings.Store(newMappings)
   570  		f.mappingsMu.Unlock()
   571  	}
   572  
   573  	if f.opts.ManualZeroing {
   574  		if err := f.manuallyZero(fr); err != nil {
   575  			return memmap.FileRange{}, err
   576  		}
   577  	}
   578  	// Mark selected pages as in use.
   579  	if !f.usage.Add(fr, usageInfo{
   580  		kind:    opts.Kind,
   581  		refs:    1,
   582  		memCgID: opts.MemCgID,
   583  	}) {
   584  		panic(fmt.Sprintf("allocating %v: failed to insert into usage set:\n%v", fr, &f.usage))
   585  	}
   586  
   587  	return fr, nil
   588  }
   589  
   590  // findAvailableRange returns an available range in the usageSet.
   591  //
   592  // Note that scanning for available slots takes place from end first backwards,
   593  // then forwards. This heuristic has important consequence for how sequential
   594  // mappings can be merged in the host VMAs, given that addresses for both
   595  // application and sentry mappings are allocated top-down (from higher to
   596  // lower addresses). The file is also grown exponentially in order to create
   597  // space for mappings to be allocated downwards.
   598  //
   599  // Precondition: alignment must be a power of 2.
   600  func (f *MemoryFile) findAvailableRange(length, alignment uint64, dir Direction) (memmap.FileRange, bool) {
   601  	if dir == BottomUp {
   602  		return findAvailableRangeBottomUp(&f.usage, length, alignment)
   603  	}
   604  	return findAvailableRangeTopDown(&f.usage, f.fileSize, length, alignment)
   605  }
   606  
   607  func findAvailableRangeTopDown(usage *usageSet, fileSize int64, length, alignment uint64) (memmap.FileRange, bool) {
   608  	alignmentMask := alignment - 1
   609  
   610  	// Search for space in existing gaps, starting at the current end of the
   611  	// file and working backward.
   612  	lastGap := usage.LastGap()
   613  	gap := lastGap
   614  	for {
   615  		end := gap.End()
   616  		if end > uint64(fileSize) {
   617  			end = uint64(fileSize)
   618  		}
   619  
   620  		// Try to allocate from the end of this gap, with the start of the
   621  		// allocated range aligned down to alignment.
   622  		unalignedStart := end - length
   623  		if unalignedStart > end {
   624  			// Negative overflow: this and all preceding gaps are too small to
   625  			// accommodate length.
   626  			break
   627  		}
   628  		if start := unalignedStart &^ alignmentMask; start >= gap.Start() {
   629  			return memmap.FileRange{start, start + length}, true
   630  		}
   631  
   632  		gap = gap.PrevLargeEnoughGap(length)
   633  		if !gap.Ok() {
   634  			break
   635  		}
   636  	}
   637  
   638  	// Check that it's possible to fit this allocation at the end of a file of any size.
   639  	min := lastGap.Start()
   640  	min = (min + alignmentMask) &^ alignmentMask
   641  	if min+length < min {
   642  		// Overflow: allocation would exceed the range of uint64.
   643  		return memmap.FileRange{}, false
   644  	}
   645  
   646  	// Determine the minimum file size required to fit this allocation at its end.
   647  	for {
   648  		newFileSize := 2 * fileSize
   649  		if newFileSize <= fileSize {
   650  			if fileSize != 0 {
   651  				// Overflow: allocation would exceed the range of int64.
   652  				return memmap.FileRange{}, false
   653  			}
   654  			newFileSize = chunkSize
   655  		}
   656  		fileSize = newFileSize
   657  
   658  		unalignedStart := uint64(fileSize) - length
   659  		if unalignedStart > uint64(fileSize) {
   660  			// Negative overflow: fileSize is still inadequate.
   661  			continue
   662  		}
   663  		if start := unalignedStart &^ alignmentMask; start >= min {
   664  			return memmap.FileRange{start, start + length}, true
   665  		}
   666  	}
   667  }
   668  
   669  func findAvailableRangeBottomUp(usage *usageSet, length, alignment uint64) (memmap.FileRange, bool) {
   670  	alignmentMask := alignment - 1
   671  	for gap := usage.FirstGap(); gap.Ok(); gap = gap.NextLargeEnoughGap(length) {
   672  		// Align the start address and check if allocation still fits in the gap.
   673  		start := (gap.Start() + alignmentMask) &^ alignmentMask
   674  
   675  		// File offsets are int64s. Since length must be strictly positive, end
   676  		// cannot legitimately be 0.
   677  		end := start + length
   678  		if end < start || int64(end) <= 0 {
   679  			return memmap.FileRange{}, false
   680  		}
   681  		if end <= gap.End() {
   682  			return memmap.FileRange{start, end}, true
   683  		}
   684  	}
   685  
   686  	// NextLargeEnoughGap should have returned a gap at the end.
   687  	panic(fmt.Sprintf("NextLargeEnoughGap didn't return a gap at the end, length: %d", length))
   688  }
   689  
   690  var mlockDisabled atomicbitops.Uint32
   691  var madvPopulateWriteDisabled atomicbitops.Uint32
   692  
   693  func canPopulate() bool {
   694  	return mlockDisabled.Load() == 0 || madvPopulateWriteDisabled.Load() == 0
   695  }
   696  
   697  func tryPopulateMadv(b safemem.Block) bool {
   698  	if madvPopulateWriteDisabled.Load() != 0 {
   699  		return false
   700  	}
   701  	start, ok := hostarch.Addr(b.Addr()).RoundUp()
   702  	if !ok {
   703  		return true
   704  	}
   705  	end := hostarch.Addr(b.Addr() + uintptr(b.Len())).RoundDown()
   706  	bLen := end - start
   707  	// Only call madvise(MADV_POPULATE_WRITE) if >=2 pages are being populated.
   708  	// 1 syscall overhead >= 1 page fault overhead. This is because syscalls are
   709  	// susceptible to additional overheads like seccomp-bpf filters and auditing.
   710  	if start >= end || bLen <= hostarch.PageSize {
   711  		return true
   712  	}
   713  	_, _, errno := unix.RawSyscall(unix.SYS_MADVISE, uintptr(start), uintptr(bLen), unix.MADV_POPULATE_WRITE)
   714  	if errno != 0 {
   715  		if errno == unix.EINVAL {
   716  			// EINVAL is expected if MADV_POPULATE_WRITE is not supported (Linux <5.14).
   717  			log.Infof("Disabling pgalloc.MemoryFile.AllocateAndFill pre-population: madvise failed: %s", errno)
   718  		} else {
   719  			log.Warningf("Disabling pgalloc.MemoryFile.AllocateAndFill pre-population: madvise failed: %s", errno)
   720  		}
   721  		madvPopulateWriteDisabled.Store(1)
   722  		return false
   723  	}
   724  	return true
   725  }
   726  
   727  func tryPopulateMlock(b safemem.Block) bool {
   728  	if mlockDisabled.Load() != 0 {
   729  		return false
   730  	}
   731  	// Call mlock to populate pages, then munlock to cancel the mlock (but keep
   732  	// the pages populated). Only do so for hugepage-aligned address ranges to
   733  	// ensure that splitting the VMA in mlock doesn't split any existing
   734  	// hugepages. This assumes that two host syscalls, plus the MM overhead of
   735  	// mlock + munlock, is faster on average than trapping for
   736  	// HugePageSize/PageSize small page faults.
   737  	start, ok := hostarch.Addr(b.Addr()).HugeRoundUp()
   738  	if !ok {
   739  		return true
   740  	}
   741  	end := hostarch.Addr(b.Addr() + uintptr(b.Len())).HugeRoundDown()
   742  	if start >= end {
   743  		return true
   744  	}
   745  	_, _, errno := unix.Syscall(unix.SYS_MLOCK, uintptr(start), uintptr(end-start), 0)
   746  	unix.RawSyscall(unix.SYS_MUNLOCK, uintptr(start), uintptr(end-start), 0)
   747  	if errno != 0 {
   748  		if errno == unix.ENOMEM || errno == unix.EPERM {
   749  			// These errors are expected from hitting non-zero RLIMIT_MEMLOCK, or
   750  			// hitting zero RLIMIT_MEMLOCK without CAP_IPC_LOCK, respectively.
   751  			log.Infof("Disabling pgalloc.MemoryFile.AllocateAndFill pre-population: mlock failed: %s", errno)
   752  		} else {
   753  			log.Warningf("Disabling pgalloc.MemoryFile.AllocateAndFill pre-population: mlock failed: %s", errno)
   754  		}
   755  		mlockDisabled.Store(1)
   756  		return false
   757  	}
   758  	return true
   759  }
   760  
   761  func tryPopulate(b safemem.Block) bool {
   762  	// There are two approaches for populating writable pages:
   763  	// 1. madvise(MADV_POPULATE_WRITE). It has the desired effect: "Populate
   764  	//    (prefault) page tables writable, faulting in all pages in the range
   765  	//    just as if manually writing to each each page".
   766  	// 2. Call mlock to populate pages, then munlock to cancel the mlock (but
   767  	//    keep the pages populated).
   768  	//
   769  	// Prefer the madvise(MADV_POPULATE_WRITE) approach because:
   770  	// - Only requires 1 syscall, as opposed to 2 syscalls with mlock approach.
   771  	// - It is faster because it doesn't have to modify vmas like mlock does.
   772  	// - It works for disk-backed memory mappings too. The mlock approach doesn't
   773  	//   work for disk-backed filesystems (e.g. ext4). This is because
   774  	//   mlock(2) => mm/gup.c:__mm_populate() emulates a read fault on writable
   775  	//   MAP_SHARED mappings. For memory-backed (shmem) files,
   776  	//   mm/mmap.c:vma_set_page_prot() => vma_wants_writenotify() is false, so
   777  	//   the page table entries populated by a read fault are writable. For
   778  	//   disk-backed files, vma_set_page_prot() => vma_wants_writenotify() is
   779  	//   true, so the page table entries populated by a read fault are read-only.
   780  	if tryPopulateMadv(b) {
   781  		return true
   782  	}
   783  	return tryPopulateMlock(b)
   784  }
   785  
   786  // fallocate(2) modes, defined in Linux's include/uapi/linux/falloc.h.
   787  const (
   788  	_FALLOC_FL_KEEP_SIZE  = 1
   789  	_FALLOC_FL_PUNCH_HOLE = 2
   790  )
   791  
   792  // Decommit releases resources associated with maintaining the contents of the
   793  // given pages. If Decommit succeeds, future accesses of the decommitted pages
   794  // will read zeroes.
   795  //
   796  // Preconditions: fr.Length() > 0.
   797  func (f *MemoryFile) Decommit(fr memmap.FileRange) error {
   798  	if !fr.WellFormed() || fr.Length() == 0 || fr.Start%hostarch.PageSize != 0 || fr.End%hostarch.PageSize != 0 {
   799  		panic(fmt.Sprintf("invalid range: %v", fr))
   800  	}
   801  
   802  	if f.opts.ManualZeroing {
   803  		// FALLOC_FL_PUNCH_HOLE may not zero pages if ManualZeroing is in
   804  		// effect.
   805  		if err := f.manuallyZero(fr); err != nil {
   806  			return err
   807  		}
   808  	} else {
   809  		if err := f.decommitFile(fr); err != nil {
   810  			return err
   811  		}
   812  	}
   813  
   814  	f.markDecommitted(fr)
   815  	return nil
   816  }
   817  
   818  func (f *MemoryFile) manuallyZero(fr memmap.FileRange) error {
   819  	return f.forEachMappingSlice(fr, func(bs []byte) {
   820  		for i := range bs {
   821  			bs[i] = 0
   822  		}
   823  	})
   824  }
   825  
   826  func (f *MemoryFile) commitFile(fr memmap.FileRange) error {
   827  	// "The default operation (i.e., mode is zero) of fallocate() allocates the
   828  	// disk space within the range specified by offset and len." - fallocate(2)
   829  	return unix.Fallocate(
   830  		int(f.file.Fd()),
   831  		0, // mode
   832  		int64(fr.Start),
   833  		int64(fr.Length()))
   834  }
   835  
   836  func (f *MemoryFile) decommitFile(fr memmap.FileRange) error {
   837  	// "After a successful call, subsequent reads from this range will
   838  	// return zeroes. The FALLOC_FL_PUNCH_HOLE flag must be ORed with
   839  	// FALLOC_FL_KEEP_SIZE in mode ..." - fallocate(2)
   840  	return unix.Fallocate(
   841  		int(f.file.Fd()),
   842  		_FALLOC_FL_PUNCH_HOLE|_FALLOC_FL_KEEP_SIZE,
   843  		int64(fr.Start),
   844  		int64(fr.Length()))
   845  }
   846  
   847  func (f *MemoryFile) markDecommitted(fr memmap.FileRange) {
   848  	f.mu.Lock()
   849  	defer f.mu.Unlock()
   850  	// Since we're changing the knownCommitted attribute, we need to merge
   851  	// across the entire range to ensure that the usage tree is minimal.
   852  	gap := f.usage.ApplyContiguous(fr, func(seg usageIterator) {
   853  		val := seg.ValuePtr()
   854  		if val.knownCommitted {
   855  			// Drop the usageExpected appropriately.
   856  			amount := seg.Range().Length()
   857  			usage.MemoryAccounting.Dec(amount, val.kind, val.memCgID)
   858  			f.usageExpected -= amount
   859  			val.knownCommitted = false
   860  		}
   861  		val.memCgID = 0
   862  	})
   863  	if gap.Ok() {
   864  		panic(fmt.Sprintf("Decommit(%v): attempted to decommit unallocated pages %v:\n%v", fr, gap.Range(), &f.usage))
   865  	}
   866  	f.usage.MergeRange(fr)
   867  }
   868  
   869  // IncRef implements memmap.File.IncRef.
   870  func (f *MemoryFile) IncRef(fr memmap.FileRange, memCgID uint32) {
   871  	if !fr.WellFormed() || fr.Length() == 0 || fr.Start%hostarch.PageSize != 0 || fr.End%hostarch.PageSize != 0 {
   872  		panic(fmt.Sprintf("invalid range: %v", fr))
   873  	}
   874  
   875  	f.mu.Lock()
   876  	defer f.mu.Unlock()
   877  
   878  	gap := f.usage.ApplyContiguous(fr, func(seg usageIterator) {
   879  		seg.ValuePtr().refs++
   880  	})
   881  	if gap.Ok() {
   882  		panic(fmt.Sprintf("IncRef(%v): attempted to IncRef on unallocated pages %v:\n%v", fr, gap.Range(), &f.usage))
   883  	}
   884  
   885  	f.usage.MergeAdjacent(fr)
   886  }
   887  
   888  // DecRef implements memmap.File.DecRef.
   889  func (f *MemoryFile) DecRef(fr memmap.FileRange) {
   890  	if !fr.WellFormed() || fr.Length() == 0 || fr.Start%hostarch.PageSize != 0 || fr.End%hostarch.PageSize != 0 {
   891  		panic(fmt.Sprintf("invalid range: %v", fr))
   892  	}
   893  
   894  	var freed bool
   895  
   896  	f.mu.Lock()
   897  	defer f.mu.Unlock()
   898  
   899  	for seg := f.usage.FindSegment(fr.Start); seg.Ok() && seg.Start() < fr.End; seg = seg.NextSegment() {
   900  		seg = f.usage.Isolate(seg, fr)
   901  		val := seg.ValuePtr()
   902  		if val.refs == 0 {
   903  			panic(fmt.Sprintf("DecRef(%v): 0 existing references on %v:\n%v", fr, seg.Range(), &f.usage))
   904  		}
   905  		val.refs--
   906  		if val.refs == 0 {
   907  			f.reclaim.Add(seg.Range(), reclaimSetValue{})
   908  			freed = true
   909  			// Reclassify memory as System, until it's freed by the reclaim
   910  			// goroutine.
   911  			if val.knownCommitted {
   912  				usage.MemoryAccounting.Move(seg.Range().Length(), usage.System, val.kind, val.memCgID)
   913  			}
   914  			val.kind = usage.System
   915  		}
   916  	}
   917  	f.usage.MergeAdjacent(fr)
   918  
   919  	if freed {
   920  		f.reclaimable = true
   921  		f.reclaimCond.Signal()
   922  	}
   923  }
   924  
   925  // MapInternal implements memmap.File.MapInternal.
   926  func (f *MemoryFile) MapInternal(fr memmap.FileRange, at hostarch.AccessType) (safemem.BlockSeq, error) {
   927  	if !fr.WellFormed() || fr.Length() == 0 {
   928  		panic(fmt.Sprintf("invalid range: %v", fr))
   929  	}
   930  	if at.Execute {
   931  		return safemem.BlockSeq{}, linuxerr.EACCES
   932  	}
   933  
   934  	chunks := ((fr.End + chunkMask) >> chunkShift) - (fr.Start >> chunkShift)
   935  	if chunks == 1 {
   936  		// Avoid an unnecessary slice allocation.
   937  		var seq safemem.BlockSeq
   938  		err := f.forEachMappingSlice(fr, func(bs []byte) {
   939  			seq = safemem.BlockSeqOf(safemem.BlockFromSafeSlice(bs))
   940  		})
   941  		return seq, err
   942  	}
   943  	blocks := make([]safemem.Block, 0, chunks)
   944  	err := f.forEachMappingSlice(fr, func(bs []byte) {
   945  		blocks = append(blocks, safemem.BlockFromSafeSlice(bs))
   946  	})
   947  	return safemem.BlockSeqFromSlice(blocks), err
   948  }
   949  
   950  // forEachMappingSlice invokes fn on a sequence of byte slices that
   951  // collectively map all bytes in fr.
   952  func (f *MemoryFile) forEachMappingSlice(fr memmap.FileRange, fn func([]byte)) error {
   953  	mappings := f.mappings.Load().([]uintptr)
   954  	for chunkStart := fr.Start &^ chunkMask; chunkStart < fr.End; chunkStart += chunkSize {
   955  		chunk := int(chunkStart >> chunkShift)
   956  		m := atomic.LoadUintptr(&mappings[chunk])
   957  		if m == 0 {
   958  			var err error
   959  			mappings, m, err = f.getChunkMapping(chunk)
   960  			if err != nil {
   961  				return err
   962  			}
   963  		}
   964  		startOff := uint64(0)
   965  		if chunkStart < fr.Start {
   966  			startOff = fr.Start - chunkStart
   967  		}
   968  		endOff := uint64(chunkSize)
   969  		if chunkStart+chunkSize > fr.End {
   970  			endOff = fr.End - chunkStart
   971  		}
   972  		fn(unsafeSlice(m, chunkSize)[startOff:endOff])
   973  	}
   974  	return nil
   975  }
   976  
   977  func (f *MemoryFile) getChunkMapping(chunk int) ([]uintptr, uintptr, error) {
   978  	f.mappingsMu.Lock()
   979  	defer f.mappingsMu.Unlock()
   980  	// Another thread may have replaced f.mappings altogether due to file
   981  	// expansion.
   982  	mappings := f.mappings.Load().([]uintptr)
   983  	// Another thread may have already mapped the chunk.
   984  	if m := mappings[chunk]; m != 0 {
   985  		return mappings, m, nil
   986  	}
   987  	m, _, errno := unix.Syscall6(
   988  		unix.SYS_MMAP,
   989  		0,
   990  		chunkSize,
   991  		unix.PROT_READ|unix.PROT_WRITE,
   992  		unix.MAP_SHARED,
   993  		f.file.Fd(),
   994  		uintptr(chunk<<chunkShift))
   995  	if errno != 0 {
   996  		return nil, 0, errno
   997  	}
   998  	atomic.StoreUintptr(&mappings[chunk], m)
   999  	return mappings, m, nil
  1000  }
  1001  
  1002  // MarkEvictable allows f to request memory deallocation by calling
  1003  // user.Evict(er) in the future.
  1004  //
  1005  // Redundantly marking an already-evictable range as evictable has no effect.
  1006  func (f *MemoryFile) MarkEvictable(user EvictableMemoryUser, er EvictableRange) {
  1007  	f.mu.Lock()
  1008  	defer f.mu.Unlock()
  1009  	info, ok := f.evictable[user]
  1010  	if !ok {
  1011  		info = &evictableMemoryUserInfo{}
  1012  		f.evictable[user] = info
  1013  	}
  1014  	gap := info.ranges.LowerBoundGap(er.Start)
  1015  	for gap.Ok() && gap.Start() < er.End {
  1016  		gapER := gap.Range().Intersect(er)
  1017  		if gapER.Length() == 0 {
  1018  			gap = gap.NextGap()
  1019  			continue
  1020  		}
  1021  		gap = info.ranges.Insert(gap, gapER, evictableRangeSetValue{}).NextGap()
  1022  	}
  1023  	if !info.evicting {
  1024  		switch f.opts.DelayedEviction {
  1025  		case DelayedEvictionDisabled:
  1026  			// Kick off eviction immediately.
  1027  			f.startEvictionGoroutineLocked(user, info)
  1028  		case DelayedEvictionEnabled:
  1029  			if !f.opts.UseHostMemcgPressure {
  1030  				// Ensure that the reclaimer goroutine is running, so that it
  1031  				// can start eviction when necessary.
  1032  				f.reclaimCond.Signal()
  1033  			}
  1034  		}
  1035  	}
  1036  }
  1037  
  1038  // MarkUnevictable informs f that user no longer considers er to be evictable,
  1039  // so the MemoryFile should no longer call user.Evict(er). Note that, per
  1040  // EvictableMemoryUser.Evict's documentation, user.Evict(er) may still be
  1041  // called even after MarkUnevictable returns due to race conditions, and
  1042  // implementations of EvictableMemoryUser must handle this possibility.
  1043  //
  1044  // Redundantly marking an already-unevictable range as unevictable has no
  1045  // effect.
  1046  func (f *MemoryFile) MarkUnevictable(user EvictableMemoryUser, er EvictableRange) {
  1047  	f.mu.Lock()
  1048  	defer f.mu.Unlock()
  1049  	info, ok := f.evictable[user]
  1050  	if !ok {
  1051  		return
  1052  	}
  1053  	seg := info.ranges.LowerBoundSegment(er.Start)
  1054  	for seg.Ok() && seg.Start() < er.End {
  1055  		seg = info.ranges.Isolate(seg, er)
  1056  		seg = info.ranges.Remove(seg).NextSegment()
  1057  	}
  1058  	// We can only remove info if there's no eviction goroutine running on its
  1059  	// behalf.
  1060  	if !info.evicting && info.ranges.IsEmpty() {
  1061  		delete(f.evictable, user)
  1062  	}
  1063  }
  1064  
  1065  // MarkAllUnevictable informs f that user no longer considers any offsets to be
  1066  // evictable. It otherwise has the same semantics as MarkUnevictable.
  1067  func (f *MemoryFile) MarkAllUnevictable(user EvictableMemoryUser) {
  1068  	f.mu.Lock()
  1069  	defer f.mu.Unlock()
  1070  	info, ok := f.evictable[user]
  1071  	if !ok {
  1072  		return
  1073  	}
  1074  	info.ranges.RemoveAll()
  1075  	// We can only remove info if there's no eviction goroutine running on its
  1076  	// behalf.
  1077  	if !info.evicting {
  1078  		delete(f.evictable, user)
  1079  	}
  1080  }
  1081  
  1082  // ShouldCacheEvictable returns true if f is meaningfully delaying evictions of
  1083  // evictable memory, such that it may be advantageous to cache data in
  1084  // evictable memory. The value returned by ShouldCacheEvictable may change
  1085  // between calls.
  1086  func (f *MemoryFile) ShouldCacheEvictable() bool {
  1087  	return f.opts.DelayedEviction == DelayedEvictionManual || f.opts.UseHostMemcgPressure
  1088  }
  1089  
  1090  // UpdateUsage ensures that the memory usage statistics in
  1091  // usage.MemoryAccounting are up to date. If forceScan is true, the
  1092  // UsageScanDuration is ignored and the memory file is scanned to get the
  1093  // memory usage.
  1094  func (f *MemoryFile) UpdateUsage(memCgID uint32) error {
  1095  	f.mu.Lock()
  1096  	defer f.mu.Unlock()
  1097  
  1098  	// If the underlying usage matches where the usage tree already
  1099  	// represents, then we can just avoid the entire scan (we know it's
  1100  	// accurate).
  1101  	currentUsage, err := f.TotalUsage()
  1102  	if err != nil {
  1103  		return err
  1104  	}
  1105  	if currentUsage == f.usageExpected && f.usageSwapped == 0 {
  1106  		log.Debugf("UpdateUsage: skipped with usageSwapped=0.")
  1107  		return nil
  1108  	}
  1109  	// If the current usage matches the expected but there's swap
  1110  	// accounting, then ensure a scan takes place at least every second
  1111  	// (when requested).
  1112  	if currentUsage == f.usageExpected+f.usageSwapped && time.Now().Before(f.usageLast.Add(time.Second)) {
  1113  		log.Debugf("UpdateUsage: skipped with usageSwapped!=0.")
  1114  		return nil
  1115  	}
  1116  
  1117  	// Linux updates usage values at CONFIG_HZ.
  1118  	if scanningAfter := time.Now().Sub(f.usageLast).Milliseconds(); scanningAfter < time.Second.Milliseconds()/linux.CLOCKS_PER_SEC {
  1119  		log.Debugf("UpdateUsage: skipped because previous scan happened %d ms back", scanningAfter)
  1120  		return nil
  1121  	}
  1122  
  1123  	if memCgID == 0 {
  1124  		f.usageLast = time.Now()
  1125  	}
  1126  	err = f.updateUsageLocked(currentUsage, memCgID, mincore)
  1127  	log.Debugf("UpdateUsage: currentUsage=%d, usageExpected=%d, usageSwapped=%d.",
  1128  		currentUsage, f.usageExpected, f.usageSwapped)
  1129  	log.Debugf("UpdateUsage: took %v.", time.Since(f.usageLast))
  1130  	return err
  1131  }
  1132  
  1133  // updateUsageLocked attempts to detect commitment of previous-uncommitted
  1134  // pages by invoking checkCommitted, which is a function that, for each page i
  1135  // in bs, sets committed[i] to 1 if the page is committed and 0 otherwise.
  1136  //
  1137  // Precondition: f.mu must be held; it may be unlocked and reacquired.
  1138  // +checklocks:f.mu
  1139  func (f *MemoryFile) updateUsageLocked(currentUsage uint64, memCgID uint32, checkCommitted func(bs []byte, committed []byte) error) error {
  1140  	// Track if anything changed to elide the merge. In the common case, we
  1141  	// expect all segments to be committed and no merge to occur.
  1142  	changedAny := false
  1143  	defer func() {
  1144  		if changedAny {
  1145  			f.usage.MergeAll()
  1146  		}
  1147  
  1148  		// Adjust the swap usage to reflect reality.
  1149  		if f.usageExpected < currentUsage {
  1150  			// Since no pages may be marked decommitted while we hold mu, we
  1151  			// know that usage may have only increased since we got the last
  1152  			// current usage. Therefore, if usageExpected is still short of
  1153  			// currentUsage, we must assume that the difference is in pages
  1154  			// that have been swapped.
  1155  			newUsageSwapped := currentUsage - f.usageExpected
  1156  			if f.usageSwapped < newUsageSwapped {
  1157  				usage.MemoryAccounting.Inc(newUsageSwapped-f.usageSwapped, usage.System, 0)
  1158  			} else {
  1159  				usage.MemoryAccounting.Dec(f.usageSwapped-newUsageSwapped, usage.System, 0)
  1160  			}
  1161  			f.usageSwapped = newUsageSwapped
  1162  		} else if f.usageSwapped != 0 {
  1163  			// We have more usage accounted for than the file itself.
  1164  			// That's fine, we probably caught a race where pages were
  1165  			// being committed while the below loop was running. Just
  1166  			// report the higher number that we found and ignore swap.
  1167  			usage.MemoryAccounting.Dec(f.usageSwapped, usage.System, 0)
  1168  			f.usageSwapped = 0
  1169  		}
  1170  	}()
  1171  
  1172  	// Reused mincore buffer, will generally be <= 4096 bytes.
  1173  	var buf []byte
  1174  
  1175  	// Iterate over all usage data. There will only be usage segments
  1176  	// present when there is an associated reference.
  1177  	for seg := f.usage.FirstSegment(); seg.Ok(); {
  1178  		if !seg.ValuePtr().canCommit() {
  1179  			seg = seg.NextSegment()
  1180  			continue
  1181  		}
  1182  
  1183  		// Scan the pages of the given memCgID only. This will avoid scanning the
  1184  		// whole memory file when the memory usage is required only for a specific
  1185  		// cgroup. The total memory usage of all cgroups can be obtained when the
  1186  		// memCgID is passed as zero.
  1187  		if memCgID != 0 && seg.ValuePtr().memCgID != memCgID {
  1188  			seg = seg.NextSegment()
  1189  			continue
  1190  		}
  1191  
  1192  		// Get the range for this segment. As we touch slices, the
  1193  		// Start value will be walked along.
  1194  		r := seg.Range()
  1195  
  1196  		var checkErr error
  1197  		err := f.forEachMappingSlice(r,
  1198  			func(s []byte) {
  1199  				if checkErr != nil {
  1200  					return
  1201  				}
  1202  
  1203  				// Ensure that we have sufficient buffer for the call
  1204  				// (one byte per page). The length of each slice must
  1205  				// be page-aligned.
  1206  				bufLen := len(s) / hostarch.PageSize
  1207  				if len(buf) < bufLen {
  1208  					buf = make([]byte, bufLen)
  1209  				}
  1210  
  1211  				// Query for new pages in core.
  1212  				// NOTE(b/165896008): mincore (which is passed as checkCommitted)
  1213  				// by f.UpdateUsage() might take a really long time. So unlock f.mu
  1214  				// while checkCommitted runs.
  1215  				f.mu.Unlock() // +checklocksforce
  1216  				err := checkCommitted(s, buf)
  1217  				f.mu.Lock()
  1218  				if err != nil {
  1219  					checkErr = err
  1220  					return
  1221  				}
  1222  
  1223  				// Scan each page and switch out segments.
  1224  				seg := f.usage.LowerBoundSegment(r.Start)
  1225  				for i := 0; i < bufLen; {
  1226  					if buf[i]&0x1 == 0 {
  1227  						i++
  1228  						continue
  1229  					}
  1230  					// Scan to the end of this committed range.
  1231  					j := i + 1
  1232  					for ; j < bufLen; j++ {
  1233  						if buf[j]&0x1 == 0 {
  1234  							break
  1235  						}
  1236  					}
  1237  					committedFR := memmap.FileRange{
  1238  						Start: r.Start + uint64(i*hostarch.PageSize),
  1239  						End:   r.Start + uint64(j*hostarch.PageSize),
  1240  					}
  1241  					// Advance seg to committedFR.Start.
  1242  					for seg.Ok() && seg.End() < committedFR.Start {
  1243  						seg = seg.NextSegment()
  1244  					}
  1245  					// Mark pages overlapping committedFR as committed.
  1246  					for seg.Ok() && seg.Start() < committedFR.End {
  1247  						if seg.ValuePtr().canCommit() {
  1248  							seg = f.usage.Isolate(seg, committedFR)
  1249  							seg.ValuePtr().knownCommitted = true
  1250  							amount := seg.Range().Length()
  1251  							usage.MemoryAccounting.Inc(amount, seg.ValuePtr().kind, seg.ValuePtr().memCgID)
  1252  							f.usageExpected += amount
  1253  							changedAny = true
  1254  						}
  1255  						seg = seg.NextSegment()
  1256  					}
  1257  					// Continue scanning for committed pages.
  1258  					i = j + 1
  1259  				}
  1260  
  1261  				// Advance r.Start.
  1262  				r.Start += uint64(len(s))
  1263  			})
  1264  		if checkErr != nil {
  1265  			return checkErr
  1266  		}
  1267  		if err != nil {
  1268  			return err
  1269  		}
  1270  
  1271  		// Continue with the first segment after r.End.
  1272  		seg = f.usage.LowerBoundSegment(r.End)
  1273  	}
  1274  
  1275  	return nil
  1276  }
  1277  
  1278  // TotalUsage returns an aggregate usage for all memory statistics except
  1279  // Mapped (which is external to MemoryFile). This is generally much cheaper
  1280  // than UpdateUsage, but will not provide a fine-grained breakdown.
  1281  func (f *MemoryFile) TotalUsage() (uint64, error) {
  1282  	// Stat the underlying file to discover the underlying usage. stat(2)
  1283  	// always reports the allocated block count in units of 512 bytes. This
  1284  	// includes pages in the page cache and swapped pages.
  1285  	var stat unix.Stat_t
  1286  	if err := unix.Fstat(int(f.file.Fd()), &stat); err != nil {
  1287  		return 0, err
  1288  	}
  1289  	return uint64(stat.Blocks * 512), nil
  1290  }
  1291  
  1292  // TotalSize returns the current size of the backing file in bytes, which is an
  1293  // upper bound on the amount of memory that can currently be allocated from the
  1294  // MemoryFile. The value returned by TotalSize is permitted to change.
  1295  func (f *MemoryFile) TotalSize() uint64 {
  1296  	f.mu.Lock()
  1297  	defer f.mu.Unlock()
  1298  	return uint64(f.fileSize)
  1299  }
  1300  
  1301  // File returns the backing file.
  1302  func (f *MemoryFile) File() *os.File {
  1303  	return f.file
  1304  }
  1305  
  1306  // FD implements memmap.File.FD.
  1307  func (f *MemoryFile) FD() int {
  1308  	return int(f.file.Fd())
  1309  }
  1310  
  1311  // IsDiskBacked returns true if f is backed by a file on disk.
  1312  func (f *MemoryFile) IsDiskBacked() bool {
  1313  	return f.opts.DiskBackedFile
  1314  }
  1315  
  1316  // String implements fmt.Stringer.String.
  1317  //
  1318  // Note that because f.String locks f.mu, calling f.String internally
  1319  // (including indirectly through the fmt package) risks recursive locking.
  1320  // Within the pgalloc package, use f.usage directly instead.
  1321  func (f *MemoryFile) String() string {
  1322  	f.mu.Lock()
  1323  	defer f.mu.Unlock()
  1324  	return f.usage.String()
  1325  }
  1326  
  1327  // runReclaim implements the reclaimer goroutine, which continuously decommits
  1328  // reclaimable pages in order to reduce memory usage and make them available
  1329  // for allocation.
  1330  func (f *MemoryFile) runReclaim() {
  1331  	for {
  1332  		// N.B. We must call f.markReclaimed on the returned FrameRange.
  1333  		fr, ok := f.findReclaimable()
  1334  		if !ok {
  1335  			break
  1336  		}
  1337  
  1338  		if f.opts.ManualZeroing {
  1339  			// If ManualZeroing is in effect, only hugepage-aligned regions may
  1340  			// be safely passed to decommitFile. Pages will be zeroed on
  1341  			// reallocation, so we don't need to perform any manual zeroing
  1342  			// here, whether or not decommitFile succeeds.
  1343  			if startAddr, ok := hostarch.Addr(fr.Start).HugeRoundUp(); ok {
  1344  				if endAddr := hostarch.Addr(fr.End).HugeRoundDown(); startAddr < endAddr {
  1345  					decommitFR := memmap.FileRange{uint64(startAddr), uint64(endAddr)}
  1346  					if err := f.decommitFile(decommitFR); err != nil {
  1347  						log.Warningf("Reclaim failed to decommit %v: %v", decommitFR, err)
  1348  					}
  1349  				}
  1350  			}
  1351  		} else {
  1352  			if err := f.decommitFile(fr); err != nil {
  1353  				log.Warningf("Reclaim failed to decommit %v: %v", fr, err)
  1354  				// Zero the pages manually. This won't reduce memory usage, but at
  1355  				// least ensures that the pages will be zero when reallocated.
  1356  				if err := f.manuallyZero(fr); err != nil {
  1357  					panic(fmt.Sprintf("Reclaim failed to decommit or zero %v: %v", fr, err))
  1358  				}
  1359  			}
  1360  		}
  1361  		f.markDecommitted(fr)
  1362  		f.markReclaimed(fr)
  1363  	}
  1364  
  1365  	// We only get here if findReclaimable finds f.destroyed set and returns
  1366  	// false.
  1367  	f.mu.Lock()
  1368  	if !f.destroyed {
  1369  		f.mu.Unlock()
  1370  		panic("findReclaimable broke out of reclaim loop, but destroyed is no longer set")
  1371  	}
  1372  	if f.opts.DecommitOnDestroy && f.fileSize > 0 {
  1373  		if err := f.decommitFile(memmap.FileRange{Start: 0, End: uint64(f.fileSize)}); err != nil {
  1374  			f.mu.Unlock()
  1375  			panic(fmt.Sprintf("failed to decommit entire memory file during destruction: %v", err))
  1376  		}
  1377  	}
  1378  	f.file.Close()
  1379  	// Ensure that any attempts to use f.file.Fd() fail instead of getting a fd
  1380  	// that has possibly been reassigned.
  1381  	f.file = nil
  1382  	f.mappingsMu.Lock()
  1383  	defer f.mappingsMu.Unlock()
  1384  	mappings := f.mappings.Load().([]uintptr)
  1385  	for i, m := range mappings {
  1386  		if m != 0 {
  1387  			_, _, errno := unix.Syscall(unix.SYS_MUNMAP, m, chunkSize, 0)
  1388  			if errno != 0 {
  1389  				log.Warningf("Failed to unmap mapping %#x for MemoryFile chunk %d: %v", m, i, errno)
  1390  			}
  1391  		}
  1392  	}
  1393  	// Similarly, invalidate f.mappings. (atomic.Value.Store(nil) panics.)
  1394  	f.mappings.Store([]uintptr{})
  1395  	f.mu.Unlock()
  1396  
  1397  	// This must be called without holding f.mu to avoid circular lock
  1398  	// ordering.
  1399  	if f.stopNotifyPressure != nil {
  1400  		f.stopNotifyPressure()
  1401  	}
  1402  }
  1403  
  1404  // findReclaimable finds memory that has been marked for reclaim.
  1405  //
  1406  // Note that there returned range will be removed from tracking. It
  1407  // must be reclaimed (removed from f.usage) at this point.
  1408  func (f *MemoryFile) findReclaimable() (memmap.FileRange, bool) {
  1409  	f.mu.Lock()
  1410  	defer f.mu.Unlock()
  1411  	for {
  1412  		for {
  1413  			if f.destroyed {
  1414  				return memmap.FileRange{}, false
  1415  			}
  1416  			if f.reclaimable {
  1417  				break
  1418  			}
  1419  			if f.opts.DelayedEviction == DelayedEvictionEnabled && !f.opts.UseHostMemcgPressure {
  1420  				// No work to do. Evict any pending evictable allocations to
  1421  				// get more reclaimable pages before going to sleep.
  1422  				f.startEvictionsLocked()
  1423  			}
  1424  			f.reclaimCond.Wait()
  1425  		}
  1426  		// Most allocations are done upwards, with exceptions being stacks and some
  1427  		// allocators that allocate top-down. Reclaim preserves this order to
  1428  		// minimize the cost of the search.
  1429  		if seg := f.reclaim.FirstSegment(); seg.Ok() {
  1430  			fr := seg.Range()
  1431  			f.reclaim.Remove(seg)
  1432  			return fr, true
  1433  		}
  1434  		// Nothing is reclaimable.
  1435  		f.reclaimable = false
  1436  	}
  1437  }
  1438  
  1439  func (f *MemoryFile) markReclaimed(fr memmap.FileRange) {
  1440  	f.mu.Lock()
  1441  	defer f.mu.Unlock()
  1442  	seg := f.usage.FindSegment(fr.Start)
  1443  	// All of fr should be mapped to a single uncommitted reclaimable
  1444  	// segment accounted to System.
  1445  	if !seg.Ok() {
  1446  		panic(fmt.Sprintf("reclaimed pages %v include unreferenced pages:\n%v", fr, &f.usage))
  1447  	}
  1448  	if !seg.Range().IsSupersetOf(fr) {
  1449  		panic(fmt.Sprintf("reclaimed pages %v are not entirely contained in segment %v with state %v:\n%v", fr, seg.Range(), seg.Value(), &f.usage))
  1450  	}
  1451  	if got, want := seg.Value(), (usageInfo{
  1452  		kind:           usage.System,
  1453  		knownCommitted: false,
  1454  		refs:           0,
  1455  		memCgID:        0,
  1456  	}); got != want {
  1457  		panic(fmt.Sprintf("reclaimed pages %v in segment %v has incorrect state %v, wanted %v:\n%v", fr, seg.Range(), got, want, &f.usage))
  1458  	}
  1459  	// Deallocate reclaimed pages. Even though all of seg is reclaimable,
  1460  	// the caller of markReclaimed may not have decommitted it, so we can
  1461  	// only mark fr as reclaimed.
  1462  	f.usage.Remove(f.usage.Isolate(seg, fr))
  1463  }
  1464  
  1465  // StartEvictions requests that f evict all evictable allocations. It does not
  1466  // wait for eviction to complete; for this, see MemoryFile.WaitForEvictions.
  1467  func (f *MemoryFile) StartEvictions() {
  1468  	f.mu.Lock()
  1469  	defer f.mu.Unlock()
  1470  	f.startEvictionsLocked()
  1471  }
  1472  
  1473  // Preconditions: f.mu must be locked.
  1474  func (f *MemoryFile) startEvictionsLocked() bool {
  1475  	startedAny := false
  1476  	for user, info := range f.evictable {
  1477  		// Don't start multiple goroutines to evict the same user's
  1478  		// allocations.
  1479  		if !info.evicting {
  1480  			f.startEvictionGoroutineLocked(user, info)
  1481  			startedAny = true
  1482  		}
  1483  	}
  1484  	return startedAny
  1485  }
  1486  
  1487  // Preconditions:
  1488  //   - info == f.evictable[user].
  1489  //   - !info.evicting.
  1490  //   - f.mu must be locked.
  1491  func (f *MemoryFile) startEvictionGoroutineLocked(user EvictableMemoryUser, info *evictableMemoryUserInfo) {
  1492  	info.evicting = true
  1493  	f.evictionWG.Add(1)
  1494  	go func() { // S/R-SAFE: f.evictionWG
  1495  		defer f.evictionWG.Done()
  1496  		for {
  1497  			f.mu.Lock()
  1498  			info, ok := f.evictable[user]
  1499  			if !ok {
  1500  				// This shouldn't happen: only this goroutine is permitted
  1501  				// to delete this entry.
  1502  				f.mu.Unlock()
  1503  				panic(fmt.Sprintf("evictableMemoryUserInfo for EvictableMemoryUser %v deleted while eviction goroutine running", user))
  1504  			}
  1505  			if info.ranges.IsEmpty() {
  1506  				delete(f.evictable, user)
  1507  				f.mu.Unlock()
  1508  				return
  1509  			}
  1510  			// Evict from the end of info.ranges, under the assumption that
  1511  			// if ranges in user start being used again (and are
  1512  			// consequently marked unevictable), such uses are more likely
  1513  			// to start from the beginning of user.
  1514  			seg := info.ranges.LastSegment()
  1515  			er := seg.Range()
  1516  			info.ranges.Remove(seg)
  1517  			// user.Evict() must be called without holding f.mu to avoid
  1518  			// circular lock ordering.
  1519  			f.mu.Unlock()
  1520  			user.Evict(context.Background(), er)
  1521  		}
  1522  	}()
  1523  }
  1524  
// WaitForEvictions blocks until f is no longer evicting any evictable
// allocations.
//
// It waits on f.evictionWG, which each eviction goroutine is added to before
// it starts and marks done when it exits. f.mu is not acquired here.
func (f *MemoryFile) WaitForEvictions() {
	f.evictionWG.Wait()
}
  1530  
  1531  type usageSetFunctions struct{}
  1532  
  1533  func (usageSetFunctions) MinKey() uint64 {
  1534  	return 0
  1535  }
  1536  
  1537  func (usageSetFunctions) MaxKey() uint64 {
  1538  	return math.MaxUint64
  1539  }
  1540  
  1541  func (usageSetFunctions) ClearValue(val *usageInfo) {
  1542  }
  1543  
  1544  func (usageSetFunctions) Merge(_ memmap.FileRange, val1 usageInfo, _ memmap.FileRange, val2 usageInfo) (usageInfo, bool) {
  1545  	return val1, val1 == val2
  1546  }
  1547  
  1548  func (usageSetFunctions) Split(_ memmap.FileRange, val usageInfo, _ uint64) (usageInfo, usageInfo) {
  1549  	return val, val
  1550  }
  1551  
  1552  // evictableRangeSetValue is the value type of evictableRangeSet.
  1553  type evictableRangeSetValue struct{}
  1554  
  1555  type evictableRangeSetFunctions struct{}
  1556  
  1557  func (evictableRangeSetFunctions) MinKey() uint64 {
  1558  	return 0
  1559  }
  1560  
  1561  func (evictableRangeSetFunctions) MaxKey() uint64 {
  1562  	return math.MaxUint64
  1563  }
  1564  
  1565  func (evictableRangeSetFunctions) ClearValue(val *evictableRangeSetValue) {
  1566  }
  1567  
  1568  func (evictableRangeSetFunctions) Merge(_ EvictableRange, _ evictableRangeSetValue, _ EvictableRange, _ evictableRangeSetValue) (evictableRangeSetValue, bool) {
  1569  	return evictableRangeSetValue{}, true
  1570  }
  1571  
  1572  func (evictableRangeSetFunctions) Split(_ EvictableRange, _ evictableRangeSetValue, _ uint64) (evictableRangeSetValue, evictableRangeSetValue) {
  1573  	return evictableRangeSetValue{}, evictableRangeSetValue{}
  1574  }
  1575  
  1576  // reclaimSetValue is the value type of reclaimSet.
  1577  type reclaimSetValue struct{}
  1578  
  1579  type reclaimSetFunctions struct{}
  1580  
  1581  func (reclaimSetFunctions) MinKey() uint64 {
  1582  	return 0
  1583  }
  1584  
  1585  func (reclaimSetFunctions) MaxKey() uint64 {
  1586  	return math.MaxUint64
  1587  }
  1588  
  1589  func (reclaimSetFunctions) ClearValue(val *reclaimSetValue) {
  1590  }
  1591  
  1592  func (reclaimSetFunctions) Merge(_ memmap.FileRange, _ reclaimSetValue, _ memmap.FileRange, _ reclaimSetValue) (reclaimSetValue, bool) {
  1593  	return reclaimSetValue{}, true
  1594  }
  1595  
  1596  func (reclaimSetFunctions) Split(_ memmap.FileRange, _ reclaimSetValue, _ uint64) (reclaimSetValue, reclaimSetValue) {
  1597  	return reclaimSetValue{}, reclaimSetValue{}
  1598  }