github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/sentry/mm/lifecycle.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package mm

import (
	"fmt"

	"github.com/metacubex/gvisor/pkg/atomicbitops"
	"github.com/metacubex/gvisor/pkg/context"
	"github.com/metacubex/gvisor/pkg/hostarch"
	"github.com/metacubex/gvisor/pkg/sentry/arch"
	"github.com/metacubex/gvisor/pkg/sentry/limits"
	"github.com/metacubex/gvisor/pkg/sentry/memmap"
	"github.com/metacubex/gvisor/pkg/sentry/pgalloc"
	"github.com/metacubex/gvisor/pkg/sentry/platform"
)

// NewMemoryManager returns a new MemoryManager with no mappings and 1 user.
func NewMemoryManager(p platform.Platform, mf *pgalloc.MemoryFile, sleepForActivation bool) *MemoryManager {
	return &MemoryManager{
		p:                  p,
		mf:                 mf,
		haveASIO:           p.SupportsAddressSpaceIO(),
		users:              atomicbitops.FromInt32(1),
		auxv:               arch.Auxv{},
		dumpability:        atomicbitops.FromInt32(int32(UserDumpable)),
		aioManager:         aioManager{contexts: make(map[uint64]*AIOContext)},
		sleepForActivation: sleepForActivation,
	}
}
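
// A minimal lifetime sketch (an illustration only, assuming a platform p, a
// pgalloc.MemoryFile mf, and a context.Context ctx are available to the
// caller): the MemoryManager starts with one user, additional users are taken
// with IncUsers, and the final DecUsers tears all mappings down.
//
//	m := NewMemoryManager(p, mf, false /* sleepForActivation */)
//	if m.IncUsers() { // second user, e.g. a CLONE_VM thread
//		m.DecUsers(ctx) // drop it again
//	}
//	m.DecUsers(ctx) // last user: all mappings in m are unmapped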

// SetMmapLayout initializes mm's layout from the given arch.Context64.
//
// Preconditions: mm contains no mappings and is not used concurrently.
func (mm *MemoryManager) SetMmapLayout(ac *arch.Context64, r *limits.LimitSet) (arch.MmapLayout, error) {
	layout, err := ac.NewMmapLayout(mm.p.MinUserAddress(), mm.p.MaxUserAddress(), r)
	if err != nil {
		return arch.MmapLayout{}, err
	}
	mm.layout = layout
	return layout, nil
}
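
// Call-order sketch (illustrative; ac and l stand in for the new image's
// arch.Context64 and limits.LimitSet): SetMmapLayout is expected to run once
// on a freshly created, still-empty MemoryManager, before any mappings are
// established, e.g. while loading a new executable image:
//
//	m := NewMemoryManager(p, mf, false)
//	layout, err := m.SetMmapLayout(ac, l)
//	if err != nil {
//		// The layout could not be computed; m still contains no mappings.
//	}
//	_ = layout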

// Fork creates a copy of mm with 1 user, as for Linux syscalls fork() or
// clone() (without CLONE_VM).
func (mm *MemoryManager) Fork(ctx context.Context) (*MemoryManager, error) {
	mm.AddressSpace().PreFork()
	defer mm.AddressSpace().PostFork()
	mm.metadataMu.Lock()
	defer mm.metadataMu.Unlock()

	var droppedIDs []memmap.MappingIdentity
	// This must run after {mm,mm2}.mappingMu.Unlock().
	defer func() {
		for _, id := range droppedIDs {
			id.DecRef(ctx)
		}
	}()

	mm.mappingMu.RLock()
	defer mm.mappingMu.RUnlock()
	mm2 := &MemoryManager{
		p:        mm.p,
		mf:       mm.mf,
		haveASIO: mm.haveASIO,
		layout:   mm.layout,
		users:    atomicbitops.FromInt32(1),
		brk:      mm.brk,
		usageAS:  mm.usageAS,
		dataAS:   mm.dataAS,
		// "The child does not inherit its parent's memory locks (mlock(2),
		// mlockall(2))." - fork(2). So lockedAS is 0 and defMLockMode is
		// MLockNone, both of which are zero values. vma.mlockMode is reset
		// when copied below.
		captureInvalidations: true,
		argv:                 mm.argv,
		envv:                 mm.envv,
		auxv:                 append(arch.Auxv(nil), mm.auxv...),
		// IncRef'd below, once we know that there isn't an error.
		executable:         mm.executable,
		dumpability:        atomicbitops.FromInt32(mm.dumpability.Load()),
		aioManager:         aioManager{contexts: make(map[uint64]*AIOContext)},
		sleepForActivation: mm.sleepForActivation,
		vdsoSigReturnAddr:  mm.vdsoSigReturnAddr,
	}

	// Copy vmas.
	dontforks := false
	dstvgap := mm2.vmas.FirstGap()
	for srcvseg := mm.vmas.FirstSegment(); srcvseg.Ok(); srcvseg = srcvseg.NextSegment() {
		vma := srcvseg.ValuePtr().copy()
		vmaAR := srcvseg.Range()

		if vma.dontfork {
			length := uint64(vmaAR.Length())
			mm2.usageAS -= length
			if vma.isPrivateDataLocked() {
				mm2.dataAS -= length
			}
			dontforks = true
			continue
		}

		// Inform the Mappable, if any, of the new mapping.
		if vma.mappable != nil {
			if err := vma.mappable.AddMapping(ctx, mm2, vmaAR, vma.off, vma.canWriteMappableLocked()); err != nil {
				_, droppedIDs = mm2.removeVMAsLocked(ctx, mm2.applicationAddrRange(), droppedIDs)
				return nil, err
			}
		}
		if vma.id != nil {
			vma.id.IncRef()
		}
		vma.mlockMode = memmap.MLockNone
		dstvgap = mm2.vmas.Insert(dstvgap, vmaAR, vma).NextGap()
		// We don't need to update mm2.usageAS since we copied it from mm
		// above.
	}

	// Copy pmas. We have to lock mm.activeMu for writing to make existing
	// private pmas copy-on-write. We also have to lock mm2.activeMu since
	// after copying vmas above, memmap.Mappables may call mm2.Invalidate. We
	// only copy private pmas, since in the common case where fork(2) is
	// immediately followed by execve(2), copying non-private pmas that can be
	// regenerated by calling memmap.Mappable.Translate is a waste of time.
	// (Linux does the same; compare kernel/fork.c:dup_mmap() =>
	// mm/memory.c:copy_page_range().)
	mm.activeMu.Lock()
	defer mm.activeMu.Unlock()
	mm2.activeMu.NestedLock(activeLockForked)
	defer mm2.activeMu.NestedUnlock(activeLockForked)
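	// If any vmas were skipped above due to vma.dontfork (MADV_DONTFORK), the
	// loop below Isolates mm's pmas along vma boundaries; merge them back
	// once the copy is complete.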
	if dontforks {
		defer mm.pmas.MergeInsideRange(mm.applicationAddrRange())
	}
	srcvseg := mm.vmas.FirstSegment()
	dstpgap := mm2.pmas.FirstGap()
	var unmapAR hostarch.AddrRange
	memCgID := pgalloc.MemoryCgroupIDFromContext(ctx)
	for srcpseg := mm.pmas.FirstSegment(); srcpseg.Ok(); srcpseg = srcpseg.NextSegment() {
		pma := srcpseg.ValuePtr()
		if !pma.private {
			continue
		}

		if dontforks {
			// Find the 'vma' that contains the starting address
			// associated with the 'pma' (there must be one).
			srcvseg = srcvseg.seekNextLowerBound(srcpseg.Start())
			if checkInvariants {
				if !srcvseg.Ok() {
					panic(fmt.Sprintf("no vma covers pma range %v", srcpseg.Range()))
				}
				if srcpseg.Start() < srcvseg.Start() {
					panic(fmt.Sprintf("vma %v ran ahead of pma %v", srcvseg.Range(), srcpseg.Range()))
				}
			}

			srcpseg = mm.pmas.Isolate(srcpseg, srcvseg.Range())
			if srcvseg.ValuePtr().dontfork {
				continue
			}
			pma = srcpseg.ValuePtr()
		}

		if !pma.needCOW {
			pma.needCOW = true
			if pma.effectivePerms.Write {
				// We don't want to unmap the whole address space, even though
				// doing so would reduce calls to unmapASLocked(), because mm
				// will most likely continue to be used after the fork, so
				// unmapping pmas unnecessarily will result in extra page
				// faults. But we do want to merge consecutive AddrRanges
				// across pma boundaries.
				if unmapAR.End == srcpseg.Start() {
					unmapAR.End = srcpseg.End()
				} else {
					if unmapAR.Length() != 0 {
						mm.unmapASLocked(unmapAR)
					}
					unmapAR = srcpseg.Range()
				}
				pma.effectivePerms.Write = false
			}
			pma.maxPerms.Write = false
		}
		fr := srcpseg.fileRange()
		// srcpseg.ValuePtr().file == mm.mf since pma.private == true.
		mm.mf.IncRef(fr, memCgID)
		addrRange := srcpseg.Range()
		mm2.addRSSLocked(addrRange)
		dstpgap = mm2.pmas.Insert(dstpgap, addrRange, *pma).NextGap()
	}
	if unmapAR.Length() != 0 {
		mm.unmapASLocked(unmapAR)
	}

	// Between when we call memmap.Mappable.AddMapping while copying vmas and
	// when we lock mm2.activeMu to copy pmas, calls to mm2.Invalidate() are
	// ineffective because the pmas they invalidate haven't yet been copied,
	// possibly allowing mm2 to get invalidated translations:
	//
	// Invalidating Mappable            mm.Fork
	// ---------------------            -------
	//
	// mm2.Invalidate()
	//                                  mm.activeMu.Lock()
	// mm.Invalidate() /* blocks */
	//                                  mm2.activeMu.Lock()
	//                                  (mm copies invalidated pma to mm2)
	//
	// This would technically be both safe (since we only copy private pmas,
	// which will still hold a reference on their memory) and consistent with
	// Linux, but we avoid it anyway by setting mm2.captureInvalidations during
	// construction, causing calls to mm2.Invalidate() to be captured in
	// mm2.capturedInvalidations, to be replayed after pmas are copied - i.e.
	// here.
	mm2.captureInvalidations = false
	for _, invArgs := range mm2.capturedInvalidations {
		mm2.invalidateLocked(invArgs.ar, invArgs.opts.InvalidatePrivate, true)
	}
	mm2.capturedInvalidations = nil

	if mm2.executable != nil {
		mm2.executable.IncRef()
	}
	return mm2, nil
}
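
// Illustrative caller-side sketch (hypothetical variable names; the real call
// site lives in the kernel's task-clone path): CLONE_VM shares the parent's
// MemoryManager via IncUsers, while a plain fork gets a copy-on-write copy via
// Fork:
//
//	if cloneFlags&linux.CLONE_VM != 0 {
//		parentMM.IncUsers() // cannot fail: the cloning task is itself a user
//		childMM = parentMM
//	} else {
//		childMM, err = parentMM.Fork(ctx)
//	}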

// IncUsers increments mm's user count and returns true. If the user count is
// already 0, IncUsers does nothing and returns false.
func (mm *MemoryManager) IncUsers() bool {
	for {
		users := mm.users.Load()
		if users == 0 {
			return false
		}
		if mm.users.CompareAndSwap(users, users+1) {
			return true
		}
	}
}
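
// The load/CompareAndSwap loop (rather than a plain atomic add) guarantees
// that a MemoryManager whose user count has already dropped to 0, and whose
// teardown in DecUsers may be in progress, is never resurrected. Callers must
// therefore check the result before using mm, e.g.:
//
//	if !m.IncUsers() {
//		// m is being destroyed; a new or different MemoryManager is needed.
//	}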

// DecUsers decrements mm's user count. If the user count reaches 0, all
// mappings in mm are unmapped.
func (mm *MemoryManager) DecUsers(ctx context.Context) {
	if users := mm.users.Add(-1); users > 0 {
		return
	} else if users < 0 {
		panic(fmt.Sprintf("Invalid MemoryManager.users: %d", users))
	}

	mm.destroyAIOManager(ctx)

	mm.metadataMu.Lock()
	exe := mm.executable
	mm.executable = nil
	mm.metadataMu.Unlock()
	if exe != nil {
		exe.DecRef(ctx)
	}

	mm.activeMu.Lock()
	// Sanity check.
	if mm.active.Load() != 0 {
		panic("active address space lost?")
	}
	// Make sure the AddressSpace is returned.
	if mm.as != nil {
		mm.as.Release()
		mm.as = nil
	}
	mm.activeMu.Unlock()

	var droppedIDs []memmap.MappingIdentity
	mm.mappingMu.Lock()
	// If mm is being dropped before mm.SetMmapLayout was called,
	// mm.applicationAddrRange() will be empty.
	if ar := mm.applicationAddrRange(); ar.Length() != 0 {
		_, droppedIDs = mm.unmapLocked(ctx, ar, droppedIDs)
	}
	mm.mappingMu.Unlock()

	for _, id := range droppedIDs {
		id.DecRef(ctx)
	}
}