github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/mm/lifecycle.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package mm

import (
	"fmt"
	"sync/atomic"

	"github.com/SagerNet/gvisor/pkg/context"
	"github.com/SagerNet/gvisor/pkg/hostarch"
	"github.com/SagerNet/gvisor/pkg/sentry/arch"
	"github.com/SagerNet/gvisor/pkg/sentry/limits"
	"github.com/SagerNet/gvisor/pkg/sentry/memmap"
	"github.com/SagerNet/gvisor/pkg/sentry/pgalloc"
	"github.com/SagerNet/gvisor/pkg/sentry/platform"
)

// NewMemoryManager returns a new MemoryManager with no mappings and 1 user.
func NewMemoryManager(p platform.Platform, mfp pgalloc.MemoryFileProvider, sleepForActivation bool) *MemoryManager {
	return &MemoryManager{
		p:                  p,
		mfp:                mfp,
		haveASIO:           p.SupportsAddressSpaceIO(),
		privateRefs:        &privateRefs{},
		users:              1,
		auxv:               arch.Auxv{},
		dumpability:        UserDumpable,
		aioManager:         aioManager{contexts: make(map[uint64]*AIOContext)},
		sleepForActivation: sleepForActivation,
	}
}

// SetMmapLayout initializes mm's layout from the given arch.Context.
//
// Preconditions: mm contains no mappings and is not used concurrently.
func (mm *MemoryManager) SetMmapLayout(ac arch.Context, r *limits.LimitSet) (arch.MmapLayout, error) {
	layout, err := ac.NewMmapLayout(mm.p.MinUserAddress(), mm.p.MaxUserAddress(), r)
	if err != nil {
		return arch.MmapLayout{}, err
	}
	mm.layout = layout
	return layout, nil
}

// Fork creates a copy of mm with 1 user, as for Linux syscalls fork() or
// clone() (without CLONE_VM).
func (mm *MemoryManager) Fork(ctx context.Context) (*MemoryManager, error) {
	mm.AddressSpace().PreFork()
	defer mm.AddressSpace().PostFork()
	mm.metadataMu.Lock()
	defer mm.metadataMu.Unlock()
	mm.mappingMu.RLock()
	defer mm.mappingMu.RUnlock()
	mm2 := &MemoryManager{
		p:           mm.p,
		mfp:         mm.mfp,
		haveASIO:    mm.haveASIO,
		layout:      mm.layout,
		privateRefs: mm.privateRefs,
		users:       1,
		brk:         mm.brk,
		usageAS:     mm.usageAS,
		dataAS:      mm.dataAS,
		// "The child does not inherit its parent's memory locks (mlock(2),
		// mlockall(2))." - fork(2). So lockedAS is 0 and defMLockMode is
		// MLockNone, both of which are zero values. vma.mlockMode is reset
		// when copied below.
		captureInvalidations: true,
		argv:                 mm.argv,
		envv:                 mm.envv,
		auxv:                 append(arch.Auxv(nil), mm.auxv...),
		// IncRef'd below, once we know that there isn't an error.
		executable:         mm.executable,
		dumpability:        mm.dumpability,
		aioManager:         aioManager{contexts: make(map[uint64]*AIOContext)},
		sleepForActivation: mm.sleepForActivation,
		vdsoSigReturnAddr:  mm.vdsoSigReturnAddr,
	}

	// Copy vmas.
	dontforks := false
	dstvgap := mm2.vmas.FirstGap()
	for srcvseg := mm.vmas.FirstSegment(); srcvseg.Ok(); srcvseg = srcvseg.NextSegment() {
		vma := srcvseg.Value() // makes a copy of the vma
		vmaAR := srcvseg.Range()

		if vma.dontfork {
			length := uint64(vmaAR.Length())
			mm2.usageAS -= length
			if vma.isPrivateDataLocked() {
				mm2.dataAS -= length
			}
			dontforks = true
			continue
		}

		// Inform the Mappable, if any, of the new mapping.
		if vma.mappable != nil {
			if err := vma.mappable.AddMapping(ctx, mm2, vmaAR, vma.off, vma.canWriteMappableLocked()); err != nil {
				mm2.removeVMAsLocked(ctx, mm2.applicationAddrRange())
				return nil, err
			}
		}
		if vma.id != nil {
			vma.id.IncRef()
		}
		vma.mlockMode = memmap.MLockNone
		dstvgap = mm2.vmas.Insert(dstvgap, vmaAR, vma).NextGap()
		// We don't need to update mm2.usageAS since we copied it from mm
		// above.
	}

	// Copy pmas. We have to lock mm.activeMu for writing to make existing
	// private pmas copy-on-write. We also have to lock mm2.activeMu since
	// after copying vmas above, memmap.Mappables may call mm2.Invalidate. We
	// only copy private pmas, since in the common case where fork(2) is
	// immediately followed by execve(2), copying non-private pmas that can be
	// regenerated by calling memmap.Mappable.Translate is a waste of time.
	// (Linux does the same; compare kernel/fork.c:dup_mmap() =>
	// mm/memory.c:copy_page_range().)
	mm2.activeMu.Lock()
	defer mm2.activeMu.Unlock()
	mm.activeMu.Lock()
	defer mm.activeMu.Unlock()
	if dontforks {
		defer mm.pmas.MergeRange(mm.applicationAddrRange())
	}
	srcvseg := mm.vmas.FirstSegment()
	dstpgap := mm2.pmas.FirstGap()
	var unmapAR hostarch.AddrRange
	for srcpseg := mm.pmas.FirstSegment(); srcpseg.Ok(); srcpseg = srcpseg.NextSegment() {
		pma := srcpseg.ValuePtr()
		if !pma.private {
			continue
		}

		if dontforks {
			// Find the 'vma' that contains the starting address
			// associated with the 'pma' (there must be one).
			srcvseg = srcvseg.seekNextLowerBound(srcpseg.Start())
			if checkInvariants {
				if !srcvseg.Ok() {
					panic(fmt.Sprintf("no vma covers pma range %v", srcpseg.Range()))
				}
				if srcpseg.Start() < srcvseg.Start() {
					panic(fmt.Sprintf("vma %v ran ahead of pma %v", srcvseg.Range(), srcpseg.Range()))
				}
			}

			srcpseg = mm.pmas.Isolate(srcpseg, srcvseg.Range())
			if srcvseg.ValuePtr().dontfork {
				continue
			}
			pma = srcpseg.ValuePtr()
		}

		if !pma.needCOW {
			pma.needCOW = true
			if pma.effectivePerms.Write {
				// We don't want to unmap the whole address space, even though
				// doing so would reduce calls to unmapASLocked(), because mm
				// will most likely continue to be used after the fork, so
				// unmapping pmas unnecessarily will result in extra page
				// faults. But we do want to merge consecutive AddrRanges
				// across pma boundaries.
				if unmapAR.End == srcpseg.Start() {
					unmapAR.End = srcpseg.End()
				} else {
					if unmapAR.Length() != 0 {
						mm.unmapASLocked(unmapAR)
					}
					unmapAR = srcpseg.Range()
				}
				pma.effectivePerms.Write = false
			}
			pma.maxPerms.Write = false
		}
		fr := srcpseg.fileRange()
		mm2.incPrivateRef(fr)
		srcpseg.ValuePtr().file.IncRef(fr)
		addrRange := srcpseg.Range()
		mm2.addRSSLocked(addrRange)
		dstpgap = mm2.pmas.Insert(dstpgap, addrRange, *pma).NextGap()
	}
	if unmapAR.Length() != 0 {
		mm.unmapASLocked(unmapAR)
	}

	// Between when we call memmap.Mappable.AddMapping while copying vmas and
	// when we lock mm2.activeMu to copy pmas, calls to mm2.Invalidate() are
	// ineffective because the pmas they invalidate haven't yet been copied,
	// possibly allowing mm2 to get invalidated translations:
	//
	// Invalidating Mappable            mm.Fork
	// ---------------------            -------
	//
	// mm2.Invalidate()
	//                                  mm.activeMu.Lock()
	// mm.Invalidate() /* blocks */
	//                                  mm2.activeMu.Lock()
	//                                  (mm copies invalidated pma to mm2)
	//
	// This would technically be both safe (since we only copy private pmas,
	// which will still hold a reference on their memory) and consistent with
	// Linux, but we avoid it anyway by setting mm2.captureInvalidations during
	// construction, causing calls to mm2.Invalidate() to be captured in
	// mm2.capturedInvalidations, to be replayed after pmas are copied - i.e.
	// here.
	mm2.captureInvalidations = false
	for _, invArgs := range mm2.capturedInvalidations {
		mm2.invalidateLocked(invArgs.ar, invArgs.opts.InvalidatePrivate, true)
	}
	mm2.capturedInvalidations = nil

	if mm2.executable != nil {
		mm2.executable.IncRef()
	}
	return mm2, nil
}

// IncUsers increments mm's user count and returns true. If the user count is
// already 0, IncUsers does nothing and returns false.
func (mm *MemoryManager) IncUsers() bool {
	for {
		users := atomic.LoadInt32(&mm.users)
		if users == 0 {
			return false
		}
		if atomic.CompareAndSwapInt32(&mm.users, users, users+1) {
			return true
		}
	}
}

// DecUsers decrements mm's user count. If the user count reaches 0, all
// mappings in mm are unmapped.
func (mm *MemoryManager) DecUsers(ctx context.Context) {
	if users := atomic.AddInt32(&mm.users, -1); users > 0 {
		return
	} else if users < 0 {
		panic(fmt.Sprintf("Invalid MemoryManager.users: %d", users))
	}

	mm.destroyAIOManager(ctx)

	mm.metadataMu.Lock()
	exe := mm.executable
	mm.executable = nil
	mm.metadataMu.Unlock()
	if exe != nil {
		exe.DecRef(ctx)
	}

	mm.activeMu.Lock()
	// Sanity check.
	if atomic.LoadInt32(&mm.active) != 0 {
		panic("active address space lost?")
	}
	// Make sure the AddressSpace is returned.
	if mm.as != nil {
		mm.as.Release()
		mm.as = nil
	}
	mm.activeMu.Unlock()

	mm.mappingMu.Lock()
	defer mm.mappingMu.Unlock()
	// If mm is being dropped before mm.SetMmapLayout was called,
	// mm.applicationAddrRange() will be empty.
	if ar := mm.applicationAddrRange(); ar.Length() != 0 {
		mm.unmapLocked(ctx, ar)
	}
}
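
// The following is an illustrative sketch and not part of the original file.
// It shows the reference-counting contract implied by the functions above: a
// MemoryManager returned by NewMemoryManager or Fork starts with exactly 1
// user, any additional holder must take a reference via IncUsers before using
// it, and every holder must eventually call DecUsers. The helper name cloneMM
// and the shareVM flag are hypothetical stand-ins for a caller such as the
// clone(2) path; they do not exist in this package.
func cloneMM(ctx context.Context, mm *MemoryManager, shareVM bool) (*MemoryManager, error) {
	if shareVM {
		// CLONE_VM-style sharing: take another reference on the same mm.
		// IncUsers fails only if the user count has already reached zero,
		// i.e. the MemoryManager is being torn down.
		if !mm.IncUsers() {
			return nil, fmt.Errorf("MemoryManager is already being destroyed")
		}
		return mm, nil
	}
	// fork()/clone() without CLONE_VM: copy the address space. The copy is
	// returned with 1 user, owned by the new task, which should later drop
	// it with DecUsers.
	return mm.Fork(ctx)
}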