github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/mm/lifecycle.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package mm

import (
	"fmt"
	"sync/atomic"

	"github.com/SagerNet/gvisor/pkg/context"
	"github.com/SagerNet/gvisor/pkg/hostarch"
	"github.com/SagerNet/gvisor/pkg/sentry/arch"
	"github.com/SagerNet/gvisor/pkg/sentry/limits"
	"github.com/SagerNet/gvisor/pkg/sentry/memmap"
	"github.com/SagerNet/gvisor/pkg/sentry/pgalloc"
	"github.com/SagerNet/gvisor/pkg/sentry/platform"
)

// NewMemoryManager returns a new MemoryManager with no mappings and 1 user.
func NewMemoryManager(p platform.Platform, mfp pgalloc.MemoryFileProvider, sleepForActivation bool) *MemoryManager {
	return &MemoryManager{
		p:                  p,
		mfp:                mfp,
		haveASIO:           p.SupportsAddressSpaceIO(),
		privateRefs:        &privateRefs{},
		users:              1,
		auxv:               arch.Auxv{},
		dumpability:        UserDumpable,
		aioManager:         aioManager{contexts: make(map[uint64]*AIOContext)},
		sleepForActivation: sleepForActivation,
	}
}
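
// A minimal construction sketch, illustrative only: the real call sites live
// in the kernel package, and p, mfp, and ctx below are assumed to be supplied
// by the caller.
//
//	mm := NewMemoryManager(p, mfp, false /* sleepForActivation */)
//	// ... use mm ...
//	mm.DecUsers(ctx) // drops the initial user, unmapping everything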

// SetMmapLayout initializes mm's layout from the given arch.Context.
//
// Preconditions: mm contains no mappings and is not used concurrently.
func (mm *MemoryManager) SetMmapLayout(ac arch.Context, r *limits.LimitSet) (arch.MmapLayout, error) {
	layout, err := ac.NewMmapLayout(mm.p.MinUserAddress(), mm.p.MaxUserAddress(), r)
	if err != nil {
		return arch.MmapLayout{}, err
	}
	mm.layout = layout
	return layout, nil
}
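
// Illustrative only: a fresh MemoryManager typically has its mmap layout set
// from the owning task's arch context and resource limits before any mappings
// are created. The names ac and r below are placeholders for those values:
//
//	if _, err := mm.SetMmapLayout(ac, r); err != nil {
//		// handle error
//	}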

// Fork creates a copy of mm with 1 user, as for Linux syscalls fork() or
// clone() (without CLONE_VM).
func (mm *MemoryManager) Fork(ctx context.Context) (*MemoryManager, error) {
	mm.AddressSpace().PreFork()
	defer mm.AddressSpace().PostFork()
	mm.metadataMu.Lock()
	defer mm.metadataMu.Unlock()
	mm.mappingMu.RLock()
	defer mm.mappingMu.RUnlock()
	mm2 := &MemoryManager{
		p:           mm.p,
		mfp:         mm.mfp,
		haveASIO:    mm.haveASIO,
		layout:      mm.layout,
		privateRefs: mm.privateRefs,
		users:       1,
		brk:         mm.brk,
		usageAS:     mm.usageAS,
		dataAS:      mm.dataAS,
		// "The child does not inherit its parent's memory locks (mlock(2),
		// mlockall(2))." - fork(2). So lockedAS is 0 and defMLockMode is
		// MLockNone, both of which are zero values. vma.mlockMode is reset
		// when copied below.
		captureInvalidations: true,
		argv:                 mm.argv,
		envv:                 mm.envv,
		auxv:                 append(arch.Auxv(nil), mm.auxv...),
		// IncRef'd below, once we know that there isn't an error.
		executable:         mm.executable,
		dumpability:        mm.dumpability,
		aioManager:         aioManager{contexts: make(map[uint64]*AIOContext)},
		sleepForActivation: mm.sleepForActivation,
		vdsoSigReturnAddr:  mm.vdsoSigReturnAddr,
	}

	// Copy vmas.
	dontforks := false
	dstvgap := mm2.vmas.FirstGap()
	for srcvseg := mm.vmas.FirstSegment(); srcvseg.Ok(); srcvseg = srcvseg.NextSegment() {
		vma := srcvseg.Value() // makes a copy of the vma
		vmaAR := srcvseg.Range()

		if vma.dontfork {
			length := uint64(vmaAR.Length())
			mm2.usageAS -= length
			if vma.isPrivateDataLocked() {
				mm2.dataAS -= length
			}
			dontforks = true
			continue
		}

		// Inform the Mappable, if any, of the new mapping.
		if vma.mappable != nil {
			if err := vma.mappable.AddMapping(ctx, mm2, vmaAR, vma.off, vma.canWriteMappableLocked()); err != nil {
				mm2.removeVMAsLocked(ctx, mm2.applicationAddrRange())
				return nil, err
			}
		}
		if vma.id != nil {
			vma.id.IncRef()
		}
		vma.mlockMode = memmap.MLockNone
		dstvgap = mm2.vmas.Insert(dstvgap, vmaAR, vma).NextGap()
		// We don't need to update mm2.usageAS since we copied it from mm
		// above.
	}

	// Copy pmas. We have to lock mm.activeMu for writing to make existing
	// private pmas copy-on-write. We also have to lock mm2.activeMu since
	// after copying vmas above, memmap.Mappables may call mm2.Invalidate. We
	// only copy private pmas, since in the common case where fork(2) is
	// immediately followed by execve(2), copying non-private pmas that can be
	// regenerated by calling memmap.Mappable.Translate is a waste of time.
	// (Linux does the same; compare kernel/fork.c:dup_mmap() =>
	// mm/memory.c:copy_page_range().)
	mm2.activeMu.Lock()
	defer mm2.activeMu.Unlock()
	mm.activeMu.Lock()
	defer mm.activeMu.Unlock()
	if dontforks {
		defer mm.pmas.MergeRange(mm.applicationAddrRange())
	}
	srcvseg := mm.vmas.FirstSegment()
	dstpgap := mm2.pmas.FirstGap()
	var unmapAR hostarch.AddrRange
	for srcpseg := mm.pmas.FirstSegment(); srcpseg.Ok(); srcpseg = srcpseg.NextSegment() {
		pma := srcpseg.ValuePtr()
		if !pma.private {
			continue
		}

		if dontforks {
			// Find the 'vma' that contains the starting address
			// associated with the 'pma' (there must be one).
			srcvseg = srcvseg.seekNextLowerBound(srcpseg.Start())
			if checkInvariants {
				if !srcvseg.Ok() {
					panic(fmt.Sprintf("no vma covers pma range %v", srcpseg.Range()))
				}
				if srcpseg.Start() < srcvseg.Start() {
					panic(fmt.Sprintf("vma %v ran ahead of pma %v", srcvseg.Range(), srcpseg.Range()))
				}
			}

			srcpseg = mm.pmas.Isolate(srcpseg, srcvseg.Range())
			if srcvseg.ValuePtr().dontfork {
				continue
			}
			pma = srcpseg.ValuePtr()
		}

		if !pma.needCOW {
			pma.needCOW = true
			if pma.effectivePerms.Write {
				// We don't want to unmap the whole address space, even though
				// doing so would reduce calls to unmapASLocked(), because mm
				// will most likely continue to be used after the fork, so
				// unmapping pmas unnecessarily will result in extra page
				// faults. But we do want to merge consecutive AddrRanges
				// across pma boundaries.
				if unmapAR.End == srcpseg.Start() {
					unmapAR.End = srcpseg.End()
				} else {
					if unmapAR.Length() != 0 {
						mm.unmapASLocked(unmapAR)
					}
					unmapAR = srcpseg.Range()
				}
				pma.effectivePerms.Write = false
			}
			pma.maxPerms.Write = false
		}
		fr := srcpseg.fileRange()
		mm2.incPrivateRef(fr)
		srcpseg.ValuePtr().file.IncRef(fr)
		addrRange := srcpseg.Range()
		mm2.addRSSLocked(addrRange)
		dstpgap = mm2.pmas.Insert(dstpgap, addrRange, *pma).NextGap()
	}
	if unmapAR.Length() != 0 {
		mm.unmapASLocked(unmapAR)
	}

	// Between when we call memmap.Mappable.AddMapping while copying vmas and
	// when we lock mm2.activeMu to copy pmas, calls to mm2.Invalidate() are
	// ineffective because the pmas they invalidate haven't yet been copied,
	// possibly allowing mm2 to get invalidated translations:
	//
	// Invalidating Mappable            mm.Fork
	// ---------------------            -------
	//
	// mm2.Invalidate()
	//                                  mm.activeMu.Lock()
	// mm.Invalidate() /* blocks */
	//                                  mm2.activeMu.Lock()
	//                                  (mm copies invalidated pma to mm2)
	//
	// This would technically be both safe (since we only copy private pmas,
	// which will still hold a reference on their memory) and consistent with
	// Linux, but we avoid it anyway by setting mm2.captureInvalidations during
	// construction, causing calls to mm2.Invalidate() to be captured in
	// mm2.capturedInvalidations, to be replayed after pmas are copied - i.e.
	// here.
	mm2.captureInvalidations = false
	for _, invArgs := range mm2.capturedInvalidations {
		mm2.invalidateLocked(invArgs.ar, invArgs.opts.InvalidatePrivate, true)
	}
	mm2.capturedInvalidations = nil

	if mm2.executable != nil {
		mm2.executable.IncRef()
	}
	return mm2, nil
}
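
// An illustrative sketch (not the actual kernel code) of how Fork and IncUsers
// divide responsibility during task cloning: without CLONE_VM the child gets a
// copied address space via Fork; with CLONE_VM it shares mm by reference
// count. The variable name shareAddressSpace below is hypothetical:
//
//	var childMM *MemoryManager
//	if shareAddressSpace {
//		if !mm.IncUsers() {
//			// mm is already being torn down; the caller must handle this.
//		}
//		childMM = mm
//	} else {
//		var err error
//		if childMM, err = mm.Fork(ctx); err != nil {
//			// handle error
//		}
//	}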

// IncUsers increments mm's user count and returns true. If the user count is
// already 0, IncUsers does nothing and returns false.
func (mm *MemoryManager) IncUsers() bool {
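	// Note: a plain atomic increment would not be safe here. Once users drops
	// to 0, DecUsers begins tearing the MemoryManager down, so the count must
	// never be revived from 0; the CAS loop only increments while users > 0.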
	for {
		users := atomic.LoadInt32(&mm.users)
		if users == 0 {
			return false
		}
		if atomic.CompareAndSwapInt32(&mm.users, users, users+1) {
			return true
		}
	}
}

// DecUsers decrements mm's user count. If the user count reaches 0, all
// mappings in mm are unmapped.
func (mm *MemoryManager) DecUsers(ctx context.Context) {
	if users := atomic.AddInt32(&mm.users, -1); users > 0 {
		return
	} else if users < 0 {
		panic(fmt.Sprintf("Invalid MemoryManager.users: %d", users))
	}

	mm.destroyAIOManager(ctx)

	mm.metadataMu.Lock()
	exe := mm.executable
	mm.executable = nil
	mm.metadataMu.Unlock()
	if exe != nil {
		exe.DecRef(ctx)
	}

	mm.activeMu.Lock()
	// Sanity check.
	if atomic.LoadInt32(&mm.active) != 0 {
		panic("active address space lost?")
	}
	// Make sure the AddressSpace is returned.
	if mm.as != nil {
		mm.as.Release()
		mm.as = nil
	}
	mm.activeMu.Unlock()

	mm.mappingMu.Lock()
	defer mm.mappingMu.Unlock()
	// If mm is being dropped before mm.SetMmapLayout was called,
	// mm.applicationAddrRange() will be empty.
	if ar := mm.applicationAddrRange(); ar.Length() != 0 {
		mm.unmapLocked(ctx, ar)
	}
}
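
// Illustrative pairing of IncUsers and DecUsers for a caller that wants to
// keep an mm alive across some operation (a sketch; error handling and names
// are placeholders):
//
//	if !mm.IncUsers() {
//		// mm has no users left and is being destroyed.
//		return
//	}
//	defer mm.DecUsers(ctx)
//	// ... safely use mm ...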