github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/sentry/mm/mm.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package mm provides a memory management subsystem. See README.md for a
    16  // detailed overview.
    17  //
    18  // Lock order:
    19  //
    20  //	fs locks, except for memmap.Mappable locks
    21  //		mm.MemoryManager.metadataMu
    22  //			mm.MemoryManager.mappingMu
    23  //				Locks taken by memmap.MappingIdentity and memmap.Mappable methods other
    24  //				than Translate
    25  //					kernel.TaskSet.mu
    26  //						mm.MemoryManager.activeMu
    27  //							Locks taken by memmap.Mappable.Translate
    28  //								platform.AddressSpace locks
    29  //									memmap.File locks
    30  //					mm.aioManager.mu
    31  //						mm.AIOContext.mu
    32  //
    33  // Only mm.MemoryManager.Fork is permitted to lock mm.MemoryManager.activeMu in
    34  // multiple mm.MemoryManagers, as it does so in a well-defined order (forked
    35  // child first).
    36  package mm
    37  
    38  import (
    39  	"sync/atomic"
    40  
    41  	"github.com/metacubex/gvisor/pkg/abi/linux"
    42  	"github.com/metacubex/gvisor/pkg/atomicbitops"
    43  	"github.com/metacubex/gvisor/pkg/hostarch"
    44  	"github.com/metacubex/gvisor/pkg/safemem"
    45  	"github.com/metacubex/gvisor/pkg/sentry/arch"
    46  	"github.com/metacubex/gvisor/pkg/sentry/memmap"
    47  	"github.com/metacubex/gvisor/pkg/sentry/pgalloc"
    48  	"github.com/metacubex/gvisor/pkg/sentry/platform"
    49  	"github.com/metacubex/gvisor/pkg/sentry/vfs"
    50  )
    51  
    52  // MapsCallbackFunc is a callback that receives all the parameters required for populating one entry of /proc/[pid]/maps.
    53  type MapsCallbackFunc func(start, end hostarch.Addr, permissions hostarch.AccessType, private string, offset uint64, devMajor, devMinor uint32, inode uint64, path string)
    54  
    55  // MemoryManager implements a virtual address space.
    56  //
    57  // +stateify savable
    58  type MemoryManager struct {
    59  	// p is immutable.
    60  	p platform.Platform
    61  
    62  	// mf is the cached result of mfp.MemoryFile(). mf is immutable.
    63  	//
    64  	// NOTE(review): mfp is not a field of this struct; confirm what it refers to.
    65  	mf *pgalloc.MemoryFile `state:"nosave"`
    66  
    67  	// haveASIO is the cached result of p.SupportsAddressSpaceIO(). Aside from
    68  	// eliminating an indirect call in the hot I/O path, this makes
    69  	// MemoryManager.asioEnabled() a leaf function, allowing it to be inlined.
    70  	//
    71  	// haveASIO is immutable.
    72  	haveASIO bool `state:"nosave"`
    73  
    74  	// layout is the memory layout.
    75  	//
    76  	// layout is set by the binary loader before the MemoryManager can be used.
    77  	layout arch.MmapLayout
    78  
    79  	// users is the number of dependencies on the mappings in the MemoryManager.
    80  	// When the number of references in users reaches zero, all mappings are
    81  	// unmapped.
    82  	users atomicbitops.Int32
    83  
    84  	// mappingMu is analogous to Linux's struct mm_struct::mmap_sem; it protects vmas, brk, usageAS, lockedAS, dataAS, and defMLockMode.
    85  	mappingMu mappingRWMutex `state:"nosave"`
    86  
    87  	// vmas stores virtual memory areas. Since vmas are stored by value,
    88  	// clients should usually use vmaIterator.ValuePtr() instead of
    89  	// vmaIterator.Value() to get a pointer to the vma rather than a copy.
    90  	//
    91  	// Invariants: vmas are always page-aligned.
    92  	//
    93  	// vmas is protected by mappingMu.
    94  	vmas vmaSet
    95  
    96  	// brk is the mm's brk, which is manipulated using the brk(2) system call.
    97  	// The brk is initially set up by the loader which maps an executable
    98  	// binary into the mm.
    99  	//
   100  	// brk is protected by mappingMu.
   101  	brk hostarch.AddrRange
   102  
   103  	// usageAS is vmas.Span(), cached to accelerate RLIMIT_AS checks.
   104  	//
   105  	// usageAS is protected by mappingMu.
   106  	usageAS uint64
   107  
   108  	// lockedAS is the combined size in bytes of all vmas with vma.mlockMode !=
   109  	// memmap.MLockNone.
   110  	//
   111  	// lockedAS is protected by mappingMu.
   112  	lockedAS uint64
   113  
   114  	// dataAS is the size of private data segments, like mm_struct->data_vm.
   115  	// It covers the vmas that are private, writable, and not stack.
   116  	//
   117  	// dataAS is protected by mappingMu.
   118  	dataAS uint64
   119  
   120  	// New VMAs created by MMap use whichever of memmap.MMapOpts.MLockMode or
   121  	// defMLockMode is greater.
   122  	//
   123  	// defMLockMode is protected by mappingMu.
   124  	defMLockMode memmap.MLockMode
   125  
   126  	// activeMu is loosely analogous to Linux's struct mm_struct::page_table_lock.
   127  	// It protects pmas, curRSS, maxRSS, as, unmapAllOnActivate, and the invalidation-capture fields.
   128  	activeMu activeRWMutex `state:"nosave"`
   129  
   130  	// pmas stores platform mapping areas used to implement vmas. Since pmas
   131  	// are stored by value, clients should usually use pmaIterator.ValuePtr()
   132  	// instead of pmaIterator.Value() to get a pointer to the pma rather than
   133  	// a copy.
   134  	//
   135  	// Inserting or removing segments from pmas should happen along with a
   136  	// call to mm.insertRSS or mm.removeRSS.
   137  	//
   138  	// Invariants: pmas are always page-aligned. If a pma exists for a given
   139  	// address, a vma must also exist for that address.
   140  	//
   141  	// pmas is protected by activeMu.
   142  	pmas pmaSet
   143  
   144  	// curRSS is pmas.Span(), cached to accelerate updates to maxRSS. It is
   145  	// reported as the MemoryManager's RSS.
   146  	//
   147  	// curRSS should be modified only via insertRSS and removeRSS, not
   148  	// directly.
   149  	//
   150  	// curRSS is protected by activeMu.
   151  	curRSS uint64
   152  
   153  	// maxRSS is the maximum resident set size in bytes of a MemoryManager.
   154  	// It is tracked as the application adds and removes mappings to pmas.
   155  	//
   156  	// maxRSS should be modified only via insertRSS, not directly.
   157  	//
   158  	// maxRSS is protected by activeMu.
   159  	maxRSS uint64
   160  
   161  	// as is the platform.AddressSpace that pmas are mapped into. active is the
   162  	// number of contexts that require as to be non-nil; if active == 0, as may
   163  	// be nil.
   164  	//
   165  	// as is protected by activeMu. active is manipulated with atomic memory
   166  	// operations; transitions to and from zero are additionally protected by
   167  	// activeMu. (This is because such transitions may need to be atomic with
   168  	// changes to as.)
   169  	as     platform.AddressSpace `state:"nosave"`
   170  	active atomicbitops.Int32    `state:"zerovalue"`
   171  
   172  	// unmapAllOnActivate indicates that the next Activate call should activate
   173  	// an empty AddressSpace.
   174  	//
   175  	// This is used to ensure that an AddressSpace cached in
   176  	// NewAddressSpace is not used after some change in the MemoryManager
   177  	// or VMAs has made that AddressSpace stale.
   178  	//
   179  	// unmapAllOnActivate is protected by activeMu. It must only be set when
   180  	// there is no active or cached AddressSpace. If as != nil, then
   181  	// invalidations should be propagated immediately.
   182  	unmapAllOnActivate bool `state:"nosave"`
   183  
   184  	// If captureInvalidations is true, calls to MM.Invalidate() are recorded
   185  	// in capturedInvalidations rather than being applied immediately to pmas.
   186  	// This is to avoid a race condition in MM.Fork(); see that function for
   187  	// details.
   188  	//
   189  	// Both captureInvalidations and capturedInvalidations are protected by
   190  	// activeMu. Neither needs to be saved since captureInvalidations is only
   191  	// enabled during MM.Fork(), during which saving can't occur.
   192  	captureInvalidations  bool             `state:"zerovalue"`
   193  	capturedInvalidations []invalidateArgs `state:"nosave"`
   194  
   195  	// dumpability describes if and how this MemoryManager may be dumped to
   196  	// userspace. This is read under kernel.TaskSet.mu, so it can't be protected
   197  	// by metadataMu.
   198  	dumpability atomicbitops.Int32
   199  
   200  	metadataMu metadataMutex `state:"nosave"` // metadataMu protects the argv, envv, auxv, and executable fields below.
   201  
   202  	// argv is the application argv. This is set up by the loader and may be
   203  	// modified by prctl(PR_SET_MM_ARG_START/PR_SET_MM_ARG_END). No
   204  	// requirements apply to argv; we do not require that argv.WellFormed().
   205  	//
   206  	// argv is protected by metadataMu.
   207  	argv hostarch.AddrRange
   208  
   209  	// envv is the application envv. This is set up by the loader and may be
   210  	// modified by prctl(PR_SET_MM_ENV_START/PR_SET_MM_ENV_END). No
   211  	// requirements apply to envv; we do not require that envv.WellFormed().
   212  	//
   213  	// envv is protected by metadataMu.
   214  	envv hostarch.AddrRange
   215  
   216  	// auxv is the ELF's auxiliary vector.
   217  	//
   218  	// auxv is protected by metadataMu.
   219  	auxv arch.Auxv
   220  
   221  	// executable is the executable for this MemoryManager. If executable
   222  	// is not nil, it holds a reference on the FileDescription.
   223  	//
   224  	// executable is protected by metadataMu.
   225  	executable *vfs.FileDescription
   226  
   227  	// aioManager keeps track of AIOContexts used for async IOs. aioManager
   228  	// must be cloned when CLONE_VM is used.
   229  	aioManager aioManager
   230  
   231  	// sleepForActivation indicates whether the task should report itself as
   232  	// sleeping before trying to activate the address space. When set to true,
   233  	// delays in activation are not reported as stuck tasks by the watchdog.
   234  	sleepForActivation bool
   235  
   236  	// vdsoSigReturnAddr is the address of 'vdso_sigreturn'.
   237  	vdsoSigReturnAddr uint64
   238  
   239  	// membarrierPrivateEnabled is non-zero if EnableMembarrierPrivate has
   240  	// previously been called. Since, as of this writing,
   241  	// MEMBARRIER_CMD_PRIVATE_EXPEDITED is implemented as a global memory
   242  	// barrier, membarrierPrivateEnabled has no other effect.
   243  	membarrierPrivateEnabled atomicbitops.Uint32
   244  
   245  	// membarrierRSeqEnabled is non-zero if EnableMembarrierRSeq has previously
   246  	// been called.
   247  	membarrierRSeqEnabled atomicbitops.Uint32
   248  }
   249  
   250  // vma represents a virtual memory area.
   251  //
   252  // Note: new fields added to this struct must be added to vma.copy and
   253  // vmaSetFunctions.Merge.
   254  //
   255  // +stateify savable
   256  type vma struct {
   257  	// mappable is the virtual memory object mapped by this vma. If mappable is
   258  	// nil, the vma represents an anonymous mapping.
   259  	mappable memmap.Mappable
   260  
   261  	// off is the offset into mappable at which this vma begins. If mappable is
   262  	// nil, off is meaningless.
   263  	off uint64
   264  
   265  	// To speed up VMA save/restore, the following fields (judging by the state
   266  	// tags, realPerms through growsDown) are grouped and saved as a single integer.
   267  
   268  	// realPerms are the memory permissions on this vma, as defined by the
   269  	// application.
   270  	realPerms hostarch.AccessType `state:".(int)"`
   271  
   272  	// effectivePerms are the memory permissions on this vma which are
   273  	// actually used to control access.
   274  	//
   275  	// Invariant: effectivePerms == realPerms.Effective().
   276  	effectivePerms hostarch.AccessType `state:"manual"`
   277  
   278  	// maxPerms limits the set of permissions that may ever apply to this
   279  	// memory, as well as accesses for which usermem.IOOpts.IgnorePermissions
   280  	// is true (e.g. ptrace(PTRACE_POKEDATA)).
   281  	//
   282  	// Invariant: maxPerms == maxPerms.Effective().
   283  	maxPerms hostarch.AccessType `state:"manual"`
   284  
   285  	// private is true if this is a MAP_PRIVATE mapping, such that writes to
   286  	// the mapping are propagated to a copy.
   287  	private bool `state:"manual"`
   288  
   289  	// growsDown is true if the mapping may be automatically extended downward
   290  	// under certain conditions. If growsDown is true, mappable must be nil.
   291  	//
   292  	// There is currently no corresponding growsUp flag; in Linux, the only
   293  	// architectures that can have VM_GROWSUP mappings are ia64, parisc, and
   294  	// metag, none of which we currently support.
   295  	growsDown bool `state:"manual"`
   296  
   297  	// dontfork is the MADV_DONTFORK setting for this vma configured by madvise().
   298  	dontfork bool
   299  
   300  	mlockMode memmap.MLockMode // mlockMode is this vma's memory-locking mode; vmas with memmap.MLockNone do not count toward MemoryManager.lockedAS.
   301  
   302  	// numaPolicy is the NUMA policy for this vma set by mbind().
   303  	numaPolicy linux.NumaPolicy
   304  
   305  	// numaNodemask is the NUMA nodemask for this vma set by mbind().
   306  	numaNodemask uint64
   307  
   308  	// If id is not nil, it controls the lifecycle of mappable and provides vma
   309  	// metadata shown in /proc/[pid]/maps, and the vma holds a reference.
   310  	id memmap.MappingIdentity
   311  
   312  	// If hint is non-empty, it is a description of the vma printed in
   313  	// /proc/[pid]/maps. hint takes priority over id.MappedName().
   314  	hint string
   315  
   316  	// lastFault records the last address that was page-faulted. It hints at
   317  	// which direction addresses in this vma are being accessed.
   318  	//
   319  	// This field can be read atomically, and written with mm.activeMu locked for
   320  	// writing and mm.mappingMu locked.
   321  	lastFault uintptr
   322  }
   323  
   324  func (v *vma) copy() vma {
   325  	return vma{
   326  		mappable:       v.mappable,
   327  		off:            v.off,
   328  		realPerms:      v.realPerms,
   329  		effectivePerms: v.effectivePerms,
   330  		maxPerms:       v.maxPerms,
   331  		private:        v.private,
   332  		growsDown:      v.growsDown,
   333  		dontfork:       v.dontfork,
   334  		mlockMode:      v.mlockMode,
   335  		numaPolicy:     v.numaPolicy,
   336  		numaNodemask:   v.numaNodemask,
   337  		id:             v.id,
   338  		hint:           v.hint,
   339  		lastFault:      atomic.LoadUintptr(&v.lastFault),
   340  	}
   341  }
   342  
   343  // pma represents a platform mapping area.
   344  //
   345  // +stateify savable
   346  type pma struct {
   347  	// file is the file mapped by this pma. Only pmas for which file is of type
   348  	// pgalloc.MemoryFile may be saved. pmas hold a reference to the
   349  	// corresponding file range while they exist.
   350  	file memmap.File `state:".(string)"`
   351  
   352  	// off is the offset into file at which this pma begins.
   353  	off uint64
   354  
   355  	// translatePerms is the permissions returned by memmap.Mappable.Translate.
   356  	// If private is true, translatePerms is hostarch.AnyAccess.
   357  	translatePerms hostarch.AccessType
   358  
   359  	// effectivePerms is the permissions allowed for non-ignorePermissions
   360  	// accesses. maxPerms is the permissions allowed for ignorePermissions
   361  	// accesses. These are vma.effectivePerms and vma.maxPerms respectively,
   362  	// masked by pma.translatePerms and with Write disallowed if pma.needCOW is
   363  	// true.
   364  	//
   365  	// These are stored in the pma so that the IO implementation can avoid
   366  	// iterating mm.vmas when pmas already exist.
   367  	effectivePerms hostarch.AccessType
   368  	maxPerms       hostarch.AccessType
   369  
   370  	// needCOW is true if writes to the mapping must be propagated to a copy.
   371  	needCOW bool
   372  
   373  	// private is true if this pma represents private memory.
   374  	//
   375  	// If private is true, file must be MemoryManager.mf, and
   376  	// calls to Invalidate for which memmap.InvalidateOpts.InvalidatePrivate is
   377  	// false should ignore the pma.
   378  	//
   379  	// If private is false, this pma caches a translation from the
   380  	// corresponding vma's memmap.Mappable.Translate.
   381  	private bool
   382  
   383  	// If internalMappings is not empty, it is the cached return value of
   384  	// file.MapInternal for the memmap.FileRange mapped by this pma.
   385  	internalMappings safemem.BlockSeq `state:"nosave"`
   386  }
   387  
   388  type invalidateArgs struct { // invalidateArgs is the element type of MemoryManager.capturedInvalidations, which records calls to MM.Invalidate().
   389  	ar   hostarch.AddrRange    // presumably the address range passed to the recorded Invalidate call; confirm against the caller.
   390  	opts memmap.InvalidateOpts // presumably the options passed to the recorded Invalidate call; confirm against the caller.
   391  }