github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/mm/mm.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package mm provides a memory management subsystem. See README.md for a
    16  // detailed overview.
    17  //
    18  // Lock order:
    19  //
    20  // fs locks, except for memmap.Mappable locks
    21  //   mm.MemoryManager.metadataMu
    22  //     mm.MemoryManager.mappingMu
    23  //       Locks taken by memmap.Mappable methods other than Translate
    24  //         mm.MemoryManager.activeMu
    25  //           Locks taken by memmap.Mappable.Translate
    26  //             mm.privateRefs.mu
    27  //               platform.AddressSpace locks
    28  //                 memmap.File locks
    29  //         mm.aioManager.mu
    30  //           mm.AIOContext.mu
    31  //
    32  // Only mm.MemoryManager.Fork is permitted to lock mm.MemoryManager.activeMu in
    33  // multiple mm.MemoryManagers, as it does so in a well-defined order (forked
    34  // child first).
    35  package mm
    36  
    37  import (
    38  	"github.com/SagerNet/gvisor/pkg/abi/linux"
    39  	"github.com/SagerNet/gvisor/pkg/hostarch"
    40  	"github.com/SagerNet/gvisor/pkg/safemem"
    41  	"github.com/SagerNet/gvisor/pkg/sentry/arch"
    42  	"github.com/SagerNet/gvisor/pkg/sentry/fsbridge"
    43  	"github.com/SagerNet/gvisor/pkg/sentry/memmap"
    44  	"github.com/SagerNet/gvisor/pkg/sentry/pgalloc"
    45  	"github.com/SagerNet/gvisor/pkg/sentry/platform"
    46  	"github.com/SagerNet/gvisor/pkg/sync"
    47  )
    48  
// MemoryManager implements a virtual address space.
//
// Its fields fall into groups guarded by different locks: mappingMu guards
// the vma-level state, activeMu guards the pma-level state and the
// platform.AddressSpace, and metadataMu guards process metadata such as argv
// and the executable. The per-field comments below state which lock (or
// atomic access discipline) applies to each field.
//
// +stateify savable
type MemoryManager struct {
	// p and mfp are immutable.
	p   platform.Platform
	mfp pgalloc.MemoryFileProvider

	// haveASIO is the cached result of p.SupportsAddressSpaceIO(). Aside from
	// eliminating an indirect call in the hot I/O path, this makes
	// MemoryManager.asioEnabled() a leaf function, allowing it to be inlined.
	//
	// haveASIO is immutable.
	haveASIO bool `state:"nosave"`

	// layout is the memory layout.
	//
	// layout is set by the binary loader before the MemoryManager can be used.
	layout arch.MmapLayout

	// privateRefs stores reference counts for private memory (memory whose
	// ownership is shared by one or more pmas instead of being owned by a
	// memmap.Mappable).
	//
	// privateRefs is immutable.
	privateRefs *privateRefs

	// users is the number of dependencies on the mappings in the MemoryManager.
	// When the number of references in users reaches zero, all mappings are
	// unmapped.
	//
	// users is accessed using atomic memory operations.
	users int32

	// mappingMu is analogous to Linux's struct mm_struct::mmap_sem.
	mappingMu sync.RWMutex `state:"nosave"`

	// vmas stores virtual memory areas. Since vmas are stored by value,
	// clients should usually use vmaIterator.ValuePtr() instead of
	// vmaIterator.Value() to get a pointer to the vma rather than a copy.
	//
	// Invariants: vmas are always page-aligned.
	//
	// vmas is protected by mappingMu.
	vmas vmaSet

	// brk is the mm's brk, which is manipulated using the brk(2) system call.
	// The brk is initially set up by the loader which maps an executable
	// binary into the mm.
	//
	// brk is protected by mappingMu.
	brk hostarch.AddrRange

	// usageAS is vmas.Span(), cached to accelerate RLIMIT_AS checks.
	//
	// usageAS is protected by mappingMu.
	usageAS uint64

	// lockedAS is the combined size in bytes of all vmas with vma.mlockMode !=
	// memmap.MLockNone.
	//
	// lockedAS is protected by mappingMu.
	lockedAS uint64

	// dataAS is the size of private data segments, like mm_struct->data_vm.
	// It counts vmas that are private, writable, and not a stack.
	//
	// dataAS is protected by mappingMu.
	dataAS uint64

	// New VMAs created by MMap use whichever of memmap.MMapOpts.MLockMode or
	// defMLockMode is greater.
	//
	// defMLockMode is protected by mappingMu.
	defMLockMode memmap.MLockMode

	// activeMu is loosely analogous to Linux's struct
	// mm_struct::page_table_lock.
	activeMu sync.RWMutex `state:"nosave"`

	// pmas stores platform mapping areas used to implement vmas. Since pmas
	// are stored by value, clients should usually use pmaIterator.ValuePtr()
	// instead of pmaIterator.Value() to get a pointer to the pma rather than
	// a copy.
	//
	// Inserting or removing segments from pmas should happen along with a
	// call to mm.insertRSS or mm.removeRSS.
	//
	// Invariants: pmas are always page-aligned. If a pma exists for a given
	// address, a vma must also exist for that address.
	//
	// pmas is protected by activeMu.
	pmas pmaSet

	// curRSS is pmas.Span(), cached to accelerate updates to maxRSS. It is
	// reported as the MemoryManager's RSS.
	//
	// curRSS should be modified only via insertRSS and removeRSS, not
	// directly.
	//
	// curRSS is protected by activeMu.
	curRSS uint64

	// maxRSS is the maximum resident set size in bytes of a MemoryManager.
	// It is tracked as the application adds and removes mappings to pmas.
	//
	// maxRSS should be modified only via insertRSS, not directly.
	//
	// maxRSS is protected by activeMu.
	maxRSS uint64

	// as is the platform.AddressSpace that pmas are mapped into. active is the
	// number of contexts that require as to be non-nil; if active == 0, as may
	// be nil.
	//
	// as is protected by activeMu. active is manipulated with atomic memory
	// operations; transitions to and from zero are additionally protected by
	// activeMu. (This is because such transitions may need to be atomic with
	// changes to as.)
	as     platform.AddressSpace `state:"nosave"`
	active int32                 `state:"zerovalue"`

	// unmapAllOnActivate indicates that the next Activate call should activate
	// an empty AddressSpace.
	//
	// This is used to ensure that an AddressSpace cached in
	// NewAddressSpace is not used after some change in the MemoryManager
	// or VMAs has made that AddressSpace stale.
	//
	// unmapAllOnActivate is protected by activeMu. It must only be set when
	// there is no active or cached AddressSpace. If as != nil, then
	// invalidations should be propagated immediately.
	unmapAllOnActivate bool `state:"nosave"`

	// If captureInvalidations is true, calls to MM.Invalidate() are recorded
	// in capturedInvalidations rather than being applied immediately to pmas.
	// This is to avoid a race condition in MM.Fork(); see that function for
	// details.
	//
	// Both captureInvalidations and capturedInvalidations are protected by
	// activeMu. Neither need to be saved since captureInvalidations is only
	// enabled during MM.Fork(), during which saving can't occur.
	captureInvalidations  bool             `state:"zerovalue"`
	capturedInvalidations []invalidateArgs `state:"nosave"`

	// metadataMu protects argv, envv, auxv, executable, and dumpability. It
	// is the outermost MemoryManager lock in the package's documented lock
	// order.
	metadataMu sync.Mutex `state:"nosave"`

	// argv is the application argv. This is set up by the loader and may be
	// modified by prctl(PR_SET_MM_ARG_START/PR_SET_MM_ARG_END). No
	// requirements apply to argv; we do not require that argv.WellFormed().
	//
	// argv is protected by metadataMu.
	argv hostarch.AddrRange

	// envv is the application envv. This is set up by the loader and may be
	// modified by prctl(PR_SET_MM_ENV_START/PR_SET_MM_ENV_END). No
	// requirements apply to envv; we do not require that envv.WellFormed().
	//
	// envv is protected by metadataMu.
	envv hostarch.AddrRange

	// auxv is the ELF's auxiliary vector.
	//
	// auxv is protected by metadataMu.
	auxv arch.Auxv

	// executable is the executable for this MemoryManager. If executable
	// is not nil, it holds a reference on the Dirent.
	//
	// executable is protected by metadataMu.
	executable fsbridge.File

	// dumpability describes if and how this MemoryManager may be dumped to
	// userspace.
	//
	// dumpability is protected by metadataMu.
	dumpability Dumpability

	// aioManager keeps track of AIOContexts used for async IOs. AIOManager
	// must be cloned when CLONE_VM is used.
	aioManager aioManager

	// sleepForActivation indicates whether the task should report to be sleeping
	// before trying to activate the address space. When set to true, delays in
	// activation are not reported as stuck tasks by the watchdog.
	sleepForActivation bool

	// vdsoSigReturnAddr is the address of 'vdso_sigreturn'.
	vdsoSigReturnAddr uint64

	// membarrierPrivateEnabled is non-zero if EnableMembarrierPrivate has
	// previously been called. Since, as of this writing,
	// MEMBARRIER_CMD_PRIVATE_EXPEDITED is implemented as a global memory
	// barrier, membarrierPrivateEnabled has no other effect.
	//
	// membarrierPrivateEnabled is accessed using atomic memory operations.
	membarrierPrivateEnabled uint32

	// membarrierRSeqEnabled is non-zero if EnableMembarrierRSeq has previously
	// been called.
	//
	// membarrierRSeqEnabled is accessed using atomic memory operations.
	membarrierRSeqEnabled uint32
}
   253  
// vma represents a virtual memory area.
//
// +stateify savable
type vma struct {
	// mappable is the virtual memory object mapped by this vma. If mappable is
	// nil, the vma represents an anonymous mapping.
	mappable memmap.Mappable

	// off is the offset into mappable at which this vma begins. If mappable is
	// nil, off is meaningless.
	off uint64

	// To speed up vma save/restore, realPerms, effectivePerms, maxPerms,
	// private, and growsDown are packed into a single integer by
	// saveRealPerms and unpacked by loadRealPerms. That integer is saved in
	// place of realPerms; the other four fields are marked "manual" because
	// they travel inside it.

	// realPerms are the memory permissions on this vma, as defined by the
	// application.
	realPerms hostarch.AccessType `state:".(int)"`

	// effectivePerms are the memory permissions on this vma which are
	// actually used to control access.
	//
	// Invariant: effectivePerms == realPerms.Effective().
	effectivePerms hostarch.AccessType `state:"manual"`

	// maxPerms limits the set of permissions that may ever apply to this
	// memory, as well as accesses for which usermem.IOOpts.IgnorePermissions
	// is true (e.g. ptrace(PTRACE_POKEDATA)).
	//
	// Invariant: maxPerms == maxPerms.Effective().
	maxPerms hostarch.AccessType `state:"manual"`

	// private is true if this is a MAP_PRIVATE mapping, such that writes to
	// the mapping are propagated to a copy.
	private bool `state:"manual"`

	// growsDown is true if the mapping may be automatically extended downward
	// under certain conditions. If growsDown is true, mappable must be nil.
	//
	// There is currently no corresponding growsUp flag; in Linux, the only
	// architectures that can have VM_GROWSUP mappings are ia64, parisc, and
	// metag, none of which we currently support.
	growsDown bool `state:"manual"`

	// dontfork is the MADV_DONTFORK setting for this vma configured by madvise().
	dontfork bool

	// mlockMode is the memory-locking mode of this vma. vmas whose mlockMode
	// is not memmap.MLockNone are counted in MemoryManager.lockedAS.
	mlockMode memmap.MLockMode

	// numaPolicy is the NUMA policy for this vma set by mbind().
	numaPolicy linux.NumaPolicy

	// numaNodemask is the NUMA nodemask for this vma set by mbind().
	numaNodemask uint64

	// If id is not nil, it controls the lifecycle of mappable and provides vma
	// metadata shown in /proc/[pid]/maps, and the vma holds a reference.
	id memmap.MappingIdentity

	// If hint is non-empty, it is a description of the vma printed in
	// /proc/[pid]/maps. hint takes priority over id.MappedName().
	hint string
}
   317  
   318  const (
   319  	vmaRealPermsRead = 1 << iota
   320  	vmaRealPermsWrite
   321  	vmaRealPermsExecute
   322  	vmaEffectivePermsRead
   323  	vmaEffectivePermsWrite
   324  	vmaEffectivePermsExecute
   325  	vmaMaxPermsRead
   326  	vmaMaxPermsWrite
   327  	vmaMaxPermsExecute
   328  	vmaPrivate
   329  	vmaGrowsDown
   330  )
   331  
   332  func (v *vma) saveRealPerms() int {
   333  	var b int
   334  	if v.realPerms.Read {
   335  		b |= vmaRealPermsRead
   336  	}
   337  	if v.realPerms.Write {
   338  		b |= vmaRealPermsWrite
   339  	}
   340  	if v.realPerms.Execute {
   341  		b |= vmaRealPermsExecute
   342  	}
   343  	if v.effectivePerms.Read {
   344  		b |= vmaEffectivePermsRead
   345  	}
   346  	if v.effectivePerms.Write {
   347  		b |= vmaEffectivePermsWrite
   348  	}
   349  	if v.effectivePerms.Execute {
   350  		b |= vmaEffectivePermsExecute
   351  	}
   352  	if v.maxPerms.Read {
   353  		b |= vmaMaxPermsRead
   354  	}
   355  	if v.maxPerms.Write {
   356  		b |= vmaMaxPermsWrite
   357  	}
   358  	if v.maxPerms.Execute {
   359  		b |= vmaMaxPermsExecute
   360  	}
   361  	if v.private {
   362  		b |= vmaPrivate
   363  	}
   364  	if v.growsDown {
   365  		b |= vmaGrowsDown
   366  	}
   367  	return b
   368  }
   369  
   370  func (v *vma) loadRealPerms(b int) {
   371  	if b&vmaRealPermsRead > 0 {
   372  		v.realPerms.Read = true
   373  	}
   374  	if b&vmaRealPermsWrite > 0 {
   375  		v.realPerms.Write = true
   376  	}
   377  	if b&vmaRealPermsExecute > 0 {
   378  		v.realPerms.Execute = true
   379  	}
   380  	if b&vmaEffectivePermsRead > 0 {
   381  		v.effectivePerms.Read = true
   382  	}
   383  	if b&vmaEffectivePermsWrite > 0 {
   384  		v.effectivePerms.Write = true
   385  	}
   386  	if b&vmaEffectivePermsExecute > 0 {
   387  		v.effectivePerms.Execute = true
   388  	}
   389  	if b&vmaMaxPermsRead > 0 {
   390  		v.maxPerms.Read = true
   391  	}
   392  	if b&vmaMaxPermsWrite > 0 {
   393  		v.maxPerms.Write = true
   394  	}
   395  	if b&vmaMaxPermsExecute > 0 {
   396  		v.maxPerms.Execute = true
   397  	}
   398  	if b&vmaPrivate > 0 {
   399  		v.private = true
   400  	}
   401  	if b&vmaGrowsDown > 0 {
   402  		v.growsDown = true
   403  	}
   404  }
   405  
// pma represents a platform mapping area.
//
// +stateify savable
type pma struct {
	// file is the file mapped by this pma. Only pmas for which file ==
	// MemoryManager.mfp.MemoryFile() may be saved. pmas hold a reference to
	// the corresponding file range while they exist.
	//
	// NOTE(review): the last sentence above appears to conflict with the
	// comment on off below, which says pmas do not themselves hold references
	// on offsets in file. Confirm which statement is current.
	file memmap.File `state:"nosave"`

	// off is the offset into file at which this pma begins.
	//
	// Note that pmas do *not* hold references on offsets in file! If private
	// is true, MemoryManager.privateRefs holds the reference instead. If
	// private is false, the corresponding memmap.Mappable holds the reference
	// instead (per memmap.Mappable.Translate requirement).
	off uint64

	// translatePerms is the permissions returned by memmap.Mappable.Translate.
	// If private is true, translatePerms is hostarch.AnyAccess.
	translatePerms hostarch.AccessType

	// effectivePerms is the permissions allowed for non-ignorePermissions
	// accesses. maxPerms is the permissions allowed for ignorePermissions
	// accesses. These are vma.effectivePerms and vma.maxPerms respectively,
	// masked by pma.translatePerms and with Write disallowed if pma.needCOW is
	// true.
	//
	// These are stored in the pma so that the IO implementation can avoid
	// iterating mm.vmas when pmas already exist.
	effectivePerms hostarch.AccessType
	maxPerms       hostarch.AccessType

	// needCOW is true if writes to the mapping must be propagated to a copy.
	needCOW bool

	// private is true if this pma represents private memory.
	//
	// If private is true, file must be MemoryManager.mfp.MemoryFile(), the pma
	// holds a reference on the mapped memory that is tracked in privateRefs,
	// and calls to Invalidate for which
	// memmap.InvalidateOpts.InvalidatePrivate is false should ignore the pma.
	//
	// If private is false, this pma caches a translation from the
	// corresponding vma's memmap.Mappable.Translate.
	private bool

	// If internalMappings is not empty, it is the cached return value of
	// file.MapInternal for the memmap.FileRange mapped by this pma.
	internalMappings safemem.BlockSeq `state:"nosave"`
}
   456  
// privateRefs holds the reference counts for private memory, i.e. memory
// whose ownership is shared by one or more pmas rather than being owned by a
// memmap.Mappable. A MemoryManager refers to it through its privateRefs
// field.
//
// +stateify savable
type privateRefs struct {
	// mu is listed as mm.privateRefs.mu in the package lock order, below
	// locks taken by memmap.Mappable.Translate and above
	// platform.AddressSpace locks.
	//
	// NOTE(review): mu presumably protects refs — confirm against the code
	// that accesses refs.
	mu sync.Mutex `state:"nosave"`

	// refs maps offsets into MemoryManager.mfp.MemoryFile() to the number of
	// pmas (or, equivalently, MemoryManagers) that share ownership of the
	// memory at that offset.
	refs fileRefcountSet
}
   466  
// invalidateArgs is the element type of MemoryManager.capturedInvalidations,
// which records calls to MM.Invalidate() made while
// MemoryManager.captureInvalidations is true instead of applying them
// immediately.
type invalidateArgs struct {
	// ar is an address range; presumably the range passed to the captured
	// Invalidate call — confirm against MM.Invalidate.
	ar hostarch.AddrRange
	// opts holds memmap.InvalidateOpts; presumably the options passed to the
	// captured Invalidate call — confirm against MM.Invalidate.
	opts memmap.InvalidateOpts
}
   471  
   472  // fileRefcountSetFunctions implements segment.Functions for fileRefcountSet.
   473  type fileRefcountSetFunctions struct{}
   474  
   475  func (fileRefcountSetFunctions) MinKey() uint64 {
   476  	return 0
   477  }
   478  
   479  func (fileRefcountSetFunctions) MaxKey() uint64 {
   480  	return ^uint64(0)
   481  }
   482  
   483  func (fileRefcountSetFunctions) ClearValue(_ *int32) {
   484  }
   485  
   486  func (fileRefcountSetFunctions) Merge(_ memmap.FileRange, rc1 int32, _ memmap.FileRange, rc2 int32) (int32, bool) {
   487  	return rc1, rc1 == rc2
   488  }
   489  
   490  func (fileRefcountSetFunctions) Split(_ memmap.FileRange, rc int32, _ uint64) (int32, int32) {
   491  	return rc, rc
   492  }