gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/pkg/sentry/mm/lifecycle.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package mm

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/atomicbitops"
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/hostarch"
	"gvisor.dev/gvisor/pkg/sentry/arch"
	"gvisor.dev/gvisor/pkg/sentry/limits"
	"gvisor.dev/gvisor/pkg/sentry/memmap"
	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
	"gvisor.dev/gvisor/pkg/sentry/platform"
)

// NewMemoryManager returns a new MemoryManager with no mappings and 1 user.
func NewMemoryManager(p platform.Platform, mf *pgalloc.MemoryFile, sleepForActivation bool) *MemoryManager {
	return &MemoryManager{
		p:                  p,
		mf:                 mf,
		haveASIO:           p.SupportsAddressSpaceIO(),
		users:              atomicbitops.FromInt32(1),
		auxv:               arch.Auxv{},
		dumpability:        atomicbitops.FromInt32(int32(UserDumpable)),
		aioManager:         aioManager{contexts: make(map[uint64]*AIOContext)},
		sleepForActivation: sleepForActivation,
	}
}

// SetMmapLayout initializes mm's layout from the given arch.Context64.
//
// Preconditions: mm contains no mappings and is not used concurrently.
func (mm *MemoryManager) SetMmapLayout(ac *arch.Context64, r *limits.LimitSet) (arch.MmapLayout, error) {
	layout, err := ac.NewMmapLayout(mm.p.MinUserAddress(), mm.p.MaxUserAddress(), r)
	if err != nil {
		return arch.MmapLayout{}, err
	}
	mm.layout = layout
	return layout, nil
}

// Fork creates a copy of mm with 1 user, as for Linux syscalls fork() or
// clone() (without CLONE_VM).
func (mm *MemoryManager) Fork(ctx context.Context) (*MemoryManager, error) {
	mm.AddressSpace().PreFork()
	defer mm.AddressSpace().PostFork()
	mm.metadataMu.Lock()
	defer mm.metadataMu.Unlock()

	var droppedIDs []memmap.MappingIdentity
	// This must run after {mm,mm2}.mappingMu.Unlock().
	defer func() {
		for _, id := range droppedIDs {
			id.DecRef(ctx)
		}
	}()

	mm.mappingMu.RLock()
	defer mm.mappingMu.RUnlock()
	mm2 := &MemoryManager{
		p:        mm.p,
		mf:       mm.mf,
		haveASIO: mm.haveASIO,
		layout:   mm.layout,
		users:    atomicbitops.FromInt32(1),
		brk:      mm.brk,
		usageAS:  mm.usageAS,
		dataAS:   mm.dataAS,
		// "The child does not inherit its parent's memory locks (mlock(2),
		// mlockall(2))." - fork(2). So lockedAS is 0 and defMLockMode is
		// MLockNone, both of which are zero values. vma.mlockMode is reset
		// when copied below.
		captureInvalidations: true,
		argv:                 mm.argv,
		envv:                 mm.envv,
		auxv:                 append(arch.Auxv(nil), mm.auxv...),
		// IncRef'd below, once we know that there isn't an error.
		executable:         mm.executable,
		dumpability:        atomicbitops.FromInt32(mm.dumpability.Load()),
		aioManager:         aioManager{contexts: make(map[uint64]*AIOContext)},
		sleepForActivation: mm.sleepForActivation,
		vdsoSigReturnAddr:  mm.vdsoSigReturnAddr,
	}

	// Copy vmas.
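	// Note: vma.dontfork marks ranges excluded from fork (typically set via
	// madvise(MADV_DONTFORK), mirroring Linux's VM_DONTCOPY). Such vmas are
	// skipped below, and since mm2's usageAS/dataAS were copied wholesale
	// from mm above, the loop subtracts their contribution back out.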
	dontforks := false
	dstvgap := mm2.vmas.FirstGap()
	for srcvseg := mm.vmas.FirstSegment(); srcvseg.Ok(); srcvseg = srcvseg.NextSegment() {
		vma := srcvseg.ValuePtr().copy()
		vmaAR := srcvseg.Range()

		if vma.dontfork {
			length := uint64(vmaAR.Length())
			mm2.usageAS -= length
			if vma.isPrivateDataLocked() {
				mm2.dataAS -= length
			}
			dontforks = true
			continue
		}

		// Inform the Mappable, if any, of the new mapping.
		if vma.mappable != nil {
			if err := vma.mappable.AddMapping(ctx, mm2, vmaAR, vma.off, vma.canWriteMappableLocked()); err != nil {
				_, droppedIDs = mm2.removeVMAsLocked(ctx, mm2.applicationAddrRange(), droppedIDs)
				return nil, err
			}
		}
		if vma.id != nil {
			vma.id.IncRef()
		}
		vma.mlockMode = memmap.MLockNone
		dstvgap = mm2.vmas.Insert(dstvgap, vmaAR, vma).NextGap()
		// We don't need to update mm2.usageAS since we copied it from mm
		// above.
	}

	// Copy pmas. We have to lock mm.activeMu for writing to make existing
	// private pmas copy-on-write. We also have to lock mm2.activeMu since
	// after copying vmas above, memmap.Mappables may call mm2.Invalidate. We
	// only copy private pmas, since in the common case where fork(2) is
	// immediately followed by execve(2), copying non-private pmas that can be
	// regenerated by calling memmap.Mappable.Translate is a waste of time.
	// (Linux does the same; compare kernel/fork.c:dup_mmap() =>
	// mm/memory.c:copy_page_range().)
	mm.activeMu.Lock()
	defer mm.activeMu.Unlock()
	mm2.activeMu.NestedLock(activeLockForked)
	defer mm2.activeMu.NestedUnlock(activeLockForked)
	if dontforks {
		defer mm.pmas.MergeInsideRange(mm.applicationAddrRange())
	}
	srcvseg := mm.vmas.FirstSegment()
	dstpgap := mm2.pmas.FirstGap()
	var unmapAR hostarch.AddrRange
	memCgID := pgalloc.MemoryCgroupIDFromContext(ctx)
	for srcpseg := mm.pmas.FirstSegment(); srcpseg.Ok(); srcpseg = srcpseg.NextSegment() {
		pma := srcpseg.ValuePtr()
		if !pma.private {
			continue
		}

		if dontforks {
			// Find the 'vma' that contains the starting address
			// associated with the 'pma' (there must be one).
			srcvseg = srcvseg.seekNextLowerBound(srcpseg.Start())
			if checkInvariants {
				if !srcvseg.Ok() {
					panic(fmt.Sprintf("no vma covers pma range %v", srcpseg.Range()))
				}
				if srcpseg.Start() < srcvseg.Start() {
					panic(fmt.Sprintf("vma %v ran ahead of pma %v", srcvseg.Range(), srcpseg.Range()))
				}
			}

			srcpseg = mm.pmas.Isolate(srcpseg, srcvseg.Range())
			if srcvseg.ValuePtr().dontfork {
				continue
			}
			pma = srcpseg.ValuePtr()
		}

		if !pma.needCOW {
			pma.needCOW = true
			if pma.effectivePerms.Write {
				// We don't want to unmap the whole address space, even though
				// doing so would reduce calls to unmapASLocked(), because mm
				// will most likely continue to be used after the fork, so
				// unmapping pmas unnecessarily will result in extra page
				// faults. But we do want to merge consecutive AddrRanges
				// across pma boundaries.
				if unmapAR.End == srcpseg.Start() {
					unmapAR.End = srcpseg.End()
				} else {
					if unmapAR.Length() != 0 {
						mm.unmapASLocked(unmapAR)
					}
					unmapAR = srcpseg.Range()
				}
				pma.effectivePerms.Write = false
			}
			pma.maxPerms.Write = false
		}
		fr := srcpseg.fileRange()
		// srcpseg.ValuePtr().file == mm.mf since pma.private == true.
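		// The child's copy of this private pma must keep the underlying
		// frames alive, so take an extra MemoryFile reference on the file
		// range; memCgID is presumably used to charge that reference to the
		// caller's memory cgroup.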
		mm.mf.IncRef(fr, memCgID)
		addrRange := srcpseg.Range()
		mm2.addRSSLocked(addrRange)
		dstpgap = mm2.pmas.Insert(dstpgap, addrRange, *pma).NextGap()
	}
	if unmapAR.Length() != 0 {
		mm.unmapASLocked(unmapAR)
	}

	// Between when we call memmap.Mappable.AddMapping while copying vmas and
	// when we lock mm2.activeMu to copy pmas, calls to mm2.Invalidate() are
	// ineffective because the pmas they invalidate haven't yet been copied,
	// possibly allowing mm2 to get invalidated translations:
	//
	//	Invalidating Mappable            mm.Fork
	//	---------------------            -------
	//
	//	mm2.Invalidate()
	//	                                 mm.activeMu.Lock()
	//	mm.Invalidate() /* blocks */
	//	                                 mm2.activeMu.Lock()
	//	                                 (mm copies invalidated pma to mm2)
	//
	// This would technically be both safe (since we only copy private pmas,
	// which will still hold a reference on their memory) and consistent with
	// Linux, but we avoid it anyway by setting mm2.captureInvalidations during
	// construction, causing calls to mm2.Invalidate() to be captured in
	// mm2.capturedInvalidations, to be replayed after pmas are copied - i.e.
	// here.
	mm2.captureInvalidations = false
	for _, invArgs := range mm2.capturedInvalidations {
		mm2.invalidateLocked(invArgs.ar, invArgs.opts.InvalidatePrivate, true)
	}
	mm2.capturedInvalidations = nil

	if mm2.executable != nil {
		mm2.executable.IncRef()
	}
	return mm2, nil
}

// IncUsers increments mm's user count and returns true. If the user count is
// already 0, IncUsers does nothing and returns false.
func (mm *MemoryManager) IncUsers() bool {
	for {
		users := mm.users.Load()
		if users == 0 {
			return false
		}
		if mm.users.CompareAndSwap(users, users+1) {
			return true
		}
	}
}

// DecUsers decrements mm's user count. If the user count reaches 0, all
// mappings in mm are unmapped.
func (mm *MemoryManager) DecUsers(ctx context.Context) {
	if users := mm.users.Add(-1); users > 0 {
		return
	} else if users < 0 {
		panic(fmt.Sprintf("Invalid MemoryManager.users: %d", users))
	}

	mm.destroyAIOManager(ctx)

	mm.metadataMu.Lock()
	exe := mm.executable
	mm.executable = nil
	mm.metadataMu.Unlock()
	if exe != nil {
		exe.DecRef(ctx)
	}

	mm.activeMu.Lock()
	// Sanity check.
	if mm.active.Load() != 0 {
		panic("active address space lost?")
	}
	// Make sure the AddressSpace is returned.
	if mm.as != nil {
		mm.as.Release()
		mm.as = nil
	}
	mm.activeMu.Unlock()

	var droppedIDs []memmap.MappingIdentity
	mm.mappingMu.Lock()
	// If mm is being dropped before mm.SetMmapLayout was called,
	// mm.applicationAddrRange() will be empty.
	if ar := mm.applicationAddrRange(); ar.Length() != 0 {
		_, droppedIDs = mm.unmapLocked(ctx, ar, droppedIDs)
	}
	mm.mappingMu.Unlock()

	for _, id := range droppedIDs {
		id.DecRef(ctx)
	}
}
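
// A minimal usage sketch, assuming a caller that shares mm between tasks (as
// a clone(CLONE_VM)-style path would): pair IncUsers with DecUsers so that
// the mappings and AddressSpace are torn down only when the last user goes
// away.
//
//	if !mm.IncUsers() {
//		// mm is already being destroyed; it cannot be shared.
//		return someError // hypothetical error handling
//	}
//	// ... the new task uses mm ...
//	mm.DecUsers(ctx)
//
// A non-CLONE_VM clone instead uses Fork above, which returns a fresh
// MemoryManager that already has users == 1.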