github.com/MerlinKodo/gvisor@v0.0.0-20231110090155-957f62ecf90e/pkg/sentry/mm/lifecycle.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package mm

import (
	"fmt"

	"github.com/MerlinKodo/gvisor/pkg/atomicbitops"
	"github.com/MerlinKodo/gvisor/pkg/context"
	"github.com/MerlinKodo/gvisor/pkg/hostarch"
	"github.com/MerlinKodo/gvisor/pkg/sentry/arch"
	"github.com/MerlinKodo/gvisor/pkg/sentry/limits"
	"github.com/MerlinKodo/gvisor/pkg/sentry/memmap"
	"github.com/MerlinKodo/gvisor/pkg/sentry/pgalloc"
	"github.com/MerlinKodo/gvisor/pkg/sentry/platform"
)

// NewMemoryManager returns a new MemoryManager with no mappings and 1 user.
func NewMemoryManager(p platform.Platform, mfp pgalloc.MemoryFileProvider, sleepForActivation bool) *MemoryManager {
	return &MemoryManager{
		p:                  p,
		mfp:                mfp,
		haveASIO:           p.SupportsAddressSpaceIO(),
		privateRefs:        &privateRefs{},
		users:              atomicbitops.FromInt32(1),
		auxv:               arch.Auxv{},
		dumpability:        atomicbitops.FromInt32(int32(UserDumpable)),
		aioManager:         aioManager{contexts: make(map[uint64]*AIOContext)},
		sleepForActivation: sleepForActivation,
	}
}

// SetMmapLayout initializes mm's layout from the given arch.Context64.
//
// Preconditions: mm contains no mappings and is not used concurrently.
func (mm *MemoryManager) SetMmapLayout(ac *arch.Context64, r *limits.LimitSet) (arch.MmapLayout, error) {
	layout, err := ac.NewMmapLayout(mm.p.MinUserAddress(), mm.p.MaxUserAddress(), r)
	if err != nil {
		return arch.MmapLayout{}, err
	}
	mm.layout = layout
	return layout, nil
}

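// A minimal usage sketch of the construction path in this file, assuming the
// caller already holds a platform.Platform p, a pgalloc.MemoryFileProvider
// mfp, an arch.Context64 ac, a limits.LimitSet ls, and a context.Context ctx
// (all of these names are illustrative):
//
//	mm := NewMemoryManager(p, mfp, false /* sleepForActivation */)
//	if _, err := mm.SetMmapLayout(ac, ls); err != nil {
//		mm.DecUsers(ctx) // drop the initial user from NewMemoryManager
//		return err
//	}
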
// Fork creates a copy of mm with 1 user, as for Linux syscalls fork() or
// clone() (without CLONE_VM).
func (mm *MemoryManager) Fork(ctx context.Context) (*MemoryManager, error) {
	mm.AddressSpace().PreFork()
	defer mm.AddressSpace().PostFork()
	mm.metadataMu.Lock()
	defer mm.metadataMu.Unlock()

	var droppedIDs []memmap.MappingIdentity
	// This must run after {mm,mm2}.mappingMu.Unlock().
	defer func() {
		for _, id := range droppedIDs {
			id.DecRef(ctx)
		}
	}()

	mm.mappingMu.RLock()
	defer mm.mappingMu.RUnlock()
	mm2 := &MemoryManager{
		p:           mm.p,
		mfp:         mm.mfp,
		haveASIO:    mm.haveASIO,
		layout:      mm.layout,
		privateRefs: mm.privateRefs,
		users:       atomicbitops.FromInt32(1),
		brk:         mm.brk,
		usageAS:     mm.usageAS,
		dataAS:      mm.dataAS,
		// "The child does not inherit its parent's memory locks (mlock(2),
		// mlockall(2))." - fork(2). So lockedAS is 0 and defMLockMode is
		// MLockNone, both of which are zero values. vma.mlockMode is reset
		// when copied below.
		captureInvalidations: true,
		argv:                 mm.argv,
		envv:                 mm.envv,
		auxv:                 append(arch.Auxv(nil), mm.auxv...),
		// IncRef'd below, once we know that there isn't an error.
		executable:         mm.executable,
		dumpability:        atomicbitops.FromInt32(mm.dumpability.Load()),
		aioManager:         aioManager{contexts: make(map[uint64]*AIOContext)},
		sleepForActivation: mm.sleepForActivation,
		vdsoSigReturnAddr:  mm.vdsoSigReturnAddr,
	}

	// Copy vmas.
	dontforks := false
	dstvgap := mm2.vmas.FirstGap()
	for srcvseg := mm.vmas.FirstSegment(); srcvseg.Ok(); srcvseg = srcvseg.NextSegment() {
		vma := srcvseg.ValuePtr().copy()
		vmaAR := srcvseg.Range()

		if vma.dontfork {
			length := uint64(vmaAR.Length())
			mm2.usageAS -= length
			if vma.isPrivateDataLocked() {
				mm2.dataAS -= length
			}
			dontforks = true
			continue
		}

		// Inform the Mappable, if any, of the new mapping.
		if vma.mappable != nil {
			if err := vma.mappable.AddMapping(ctx, mm2, vmaAR, vma.off, vma.canWriteMappableLocked()); err != nil {
				_, droppedIDs = mm2.removeVMAsLocked(ctx, mm2.applicationAddrRange(), droppedIDs)
				return nil, err
			}
		}
		if vma.id != nil {
			vma.id.IncRef()
		}
		vma.mlockMode = memmap.MLockNone
		dstvgap = mm2.vmas.Insert(dstvgap, vmaAR, vma).NextGap()
		// We don't need to update mm2.usageAS since we copied it from mm
		// above.
	}

	// Copy pmas. We have to lock mm.activeMu for writing to make existing
	// private pmas copy-on-write. We also have to lock mm2.activeMu since
	// after copying vmas above, memmap.Mappables may call mm2.Invalidate. We
	// only copy private pmas, since in the common case where fork(2) is
	// immediately followed by execve(2), copying non-private pmas that can be
	// regenerated by calling memmap.Mappable.Translate is a waste of time.
	// (Linux does the same; compare kernel/fork.c:dup_mmap() =>
	// mm/memory.c:copy_page_range().)
	mm.activeMu.Lock()
	defer mm.activeMu.Unlock()
	mm2.activeMu.NestedLock(activeLockForked)
	defer mm2.activeMu.NestedUnlock(activeLockForked)
	if dontforks {
		defer mm.pmas.MergeRange(mm.applicationAddrRange())
	}
	srcvseg := mm.vmas.FirstSegment()
	dstpgap := mm2.pmas.FirstGap()
	var unmapAR hostarch.AddrRange
	memCgID := pgalloc.MemoryCgroupIDFromContext(ctx)
	for srcpseg := mm.pmas.FirstSegment(); srcpseg.Ok(); srcpseg = srcpseg.NextSegment() {
		pma := srcpseg.ValuePtr()
		if !pma.private {
			continue
		}

		if dontforks {
			// Find the 'vma' that contains the starting address
			// associated with the 'pma' (there must be one).
			srcvseg = srcvseg.seekNextLowerBound(srcpseg.Start())
			if checkInvariants {
				if !srcvseg.Ok() {
					panic(fmt.Sprintf("no vma covers pma range %v", srcpseg.Range()))
				}
				if srcpseg.Start() < srcvseg.Start() {
					panic(fmt.Sprintf("vma %v ran ahead of pma %v", srcvseg.Range(), srcpseg.Range()))
				}
			}

			srcpseg = mm.pmas.Isolate(srcpseg, srcvseg.Range())
			if srcvseg.ValuePtr().dontfork {
				continue
			}
			pma = srcpseg.ValuePtr()
		}

		if !pma.needCOW {
			pma.needCOW = true
			if pma.effectivePerms.Write {
				// We don't want to unmap the whole address space, even though
				// doing so would reduce calls to unmapASLocked(), because mm
				// will most likely continue to be used after the fork, so
				// unmapping pmas unnecessarily will result in extra page
				// faults. But we do want to merge consecutive AddrRanges
				// across pma boundaries.
				if unmapAR.End == srcpseg.Start() {
					unmapAR.End = srcpseg.End()
				} else {
					if unmapAR.Length() != 0 {
						mm.unmapASLocked(unmapAR)
					}
					unmapAR = srcpseg.Range()
				}
				pma.effectivePerms.Write = false
			}
			pma.maxPerms.Write = false
		}
		fr := srcpseg.fileRange()
		mm2.incPrivateRef(fr)
		srcpseg.ValuePtr().file.IncRef(fr, memCgID)
		addrRange := srcpseg.Range()
		mm2.addRSSLocked(addrRange)
		dstpgap = mm2.pmas.Insert(dstpgap, addrRange, *pma).NextGap()
	}
	if unmapAR.Length() != 0 {
		mm.unmapASLocked(unmapAR)
	}

	// Between when we call memmap.Mappable.AddMapping while copying vmas and
	// when we lock mm2.activeMu to copy pmas, calls to mm2.Invalidate() are
	// ineffective because the pmas they invalidate haven't yet been copied,
	// possibly allowing mm2 to get invalidated translations:
	//
	// Invalidating Mappable            mm.Fork
	// ---------------------            -------
	//
	// mm2.Invalidate()
	//                                  mm.activeMu.Lock()
	// mm.Invalidate() /* blocks */
	//                                  mm2.activeMu.Lock()
	//                                  (mm copies invalidated pma to mm2)
	//
	// This would technically be both safe (since we only copy private pmas,
	// which will still hold a reference on their memory) and consistent with
	// Linux, but we avoid it anyway by setting mm2.captureInvalidations during
	// construction, causing calls to mm2.Invalidate() to be captured in
	// mm2.capturedInvalidations, to be replayed after pmas are copied - i.e.
	// here.
	mm2.captureInvalidations = false
	for _, invArgs := range mm2.capturedInvalidations {
		mm2.invalidateLocked(invArgs.ar, invArgs.opts.InvalidatePrivate, true)
	}
	mm2.capturedInvalidations = nil

	if mm2.executable != nil {
		mm2.executable.IncRef()
	}
	return mm2, nil
}

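// A minimal sketch of how a caller implementing clone(2) might choose between
// Fork and IncUsers; the surrounding task plumbing, flag handling, and error
// choice are illustrative only:
//
//	var childMM *MemoryManager
//	if cloneFlags&linux.CLONE_VM != 0 {
//		// Share the parent's address space. IncUsers returns false only if
//		// a concurrent DecUsers has already dropped the last user, in which
//		// case mm must be treated as dead rather than retried.
//		if !parentMM.IncUsers() {
//			return nil, linuxerr.EINVAL // illustrative error choice
//		}
//		childMM = parentMM
//	} else {
//		// Copy-on-write duplicate, as for fork(2) or clone(2) without
//		// CLONE_VM.
//		var err error
//		if childMM, err = parentMM.Fork(ctx); err != nil {
//			return nil, err
//		}
//	}
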
// IncUsers increments mm's user count and returns true. If the user count is
// already 0, IncUsers does nothing and returns false.
func (mm *MemoryManager) IncUsers() bool {
	for {
		users := mm.users.Load()
		if users == 0 {
			return false
		}
		if mm.users.CompareAndSwap(users, users+1) {
			return true
		}
	}
}

// DecUsers decrements mm's user count. If the user count reaches 0, all
// mappings in mm are unmapped.
func (mm *MemoryManager) DecUsers(ctx context.Context) {
	if users := mm.users.Add(-1); users > 0 {
		return
	} else if users < 0 {
		panic(fmt.Sprintf("Invalid MemoryManager.users: %d", users))
	}

	mm.destroyAIOManager(ctx)

	mm.metadataMu.Lock()
	exe := mm.executable
	mm.executable = nil
	mm.metadataMu.Unlock()
	if exe != nil {
		exe.DecRef(ctx)
	}

	mm.activeMu.Lock()
	// Sanity check.
	if mm.active.Load() != 0 {
		panic("active address space lost?")
	}
	// Make sure the AddressSpace is returned.
	if mm.as != nil {
		mm.as.Release()
		mm.as = nil
	}
	mm.activeMu.Unlock()

	var droppedIDs []memmap.MappingIdentity
	mm.mappingMu.Lock()
	// If mm is being dropped before mm.SetMmapLayout was called,
	// mm.applicationAddrRange() will be empty.
	if ar := mm.applicationAddrRange(); ar.Length() != 0 {
		_, droppedIDs = mm.unmapLocked(ctx, ar, droppedIDs)
	}
	mm.mappingMu.Unlock()

	for _, id := range droppedIDs {
		id.DecRef(ctx)
	}
}