github.com/nicocha30/gvisor-ligolo@v0.0.0-20230726075806-989fa2c0a413/pkg/sentry/mm/lifecycle.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package mm

import (
	"fmt"

	"github.com/nicocha30/gvisor-ligolo/pkg/atomicbitops"
	"github.com/nicocha30/gvisor-ligolo/pkg/context"
	"github.com/nicocha30/gvisor-ligolo/pkg/hostarch"
	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/arch"
	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/limits"
	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/memmap"
	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/pgalloc"
	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/platform"
)

// NewMemoryManager returns a new MemoryManager with no mappings and 1 user.
func NewMemoryManager(p platform.Platform, mfp pgalloc.MemoryFileProvider, sleepForActivation bool) *MemoryManager {
	return &MemoryManager{
		p:                  p,
		mfp:                mfp,
		haveASIO:           p.SupportsAddressSpaceIO(),
		privateRefs:        &privateRefs{},
		users:              atomicbitops.FromInt32(1),
		auxv:               arch.Auxv{},
		dumpability:        atomicbitops.FromInt32(int32(UserDumpable)),
		aioManager:         aioManager{contexts: make(map[uint64]*AIOContext)},
		sleepForActivation: sleepForActivation,
	}
}

// SetMmapLayout initializes mm's layout from the given arch.Context64.
//
// Preconditions: mm contains no mappings and is not used concurrently.
func (mm *MemoryManager) SetMmapLayout(ac *arch.Context64, r *limits.LimitSet) (arch.MmapLayout, error) {
	layout, err := ac.NewMmapLayout(mm.p.MinUserAddress(), mm.p.MaxUserAddress(), r)
	if err != nil {
		return arch.MmapLayout{}, err
	}
	mm.layout = layout
	return layout, nil
}

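// newAppMM is a hypothetical helper, included only as a minimal usage sketch
// (it is not part of the upstream API): it assumes the caller has already
// obtained a platform.Platform, a pgalloc.MemoryFileProvider, the task's
// arch.Context64, and its limits.LimitSet elsewhere in the sentry, and it
// shows the intended call order (NewMemoryManager, then SetMmapLayout) before
// any mappings are created.
func newAppMM(ctx context.Context, p platform.Platform, mfp pgalloc.MemoryFileProvider, ac *arch.Context64, r *limits.LimitSet, sleepForActivation bool) (*MemoryManager, error) {
	mm := NewMemoryManager(p, mfp, sleepForActivation)
	if _, err := mm.SetMmapLayout(ac, r); err != nil {
		// The new MemoryManager has exactly one user; dropping it releases
		// the state allocated above.
		mm.DecUsers(ctx)
		return nil, err
	}
	return mm, nil
}
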
// Fork creates a copy of mm with 1 user, as for Linux syscalls fork() or
// clone() (without CLONE_VM).
func (mm *MemoryManager) Fork(ctx context.Context) (*MemoryManager, error) {
	mm.AddressSpace().PreFork()
	defer mm.AddressSpace().PostFork()
	mm.metadataMu.Lock()
	defer mm.metadataMu.Unlock()

	var droppedIDs []memmap.MappingIdentity
	// This must run after {mm,mm2}.mappingMu.Unlock().
	defer func() {
		for _, id := range droppedIDs {
			id.DecRef(ctx)
		}
	}()

	mm.mappingMu.RLock()
	defer mm.mappingMu.RUnlock()
	mm2 := &MemoryManager{
		p:           mm.p,
		mfp:         mm.mfp,
		haveASIO:    mm.haveASIO,
		layout:      mm.layout,
		privateRefs: mm.privateRefs,
		users:       atomicbitops.FromInt32(1),
		brk:         mm.brk,
		usageAS:     mm.usageAS,
		dataAS:      mm.dataAS,
		// "The child does not inherit its parent's memory locks (mlock(2),
		// mlockall(2))." - fork(2). So lockedAS is 0 and defMLockMode is
		// MLockNone, both of which are zero values. vma.mlockMode is reset
		// when copied below.
		captureInvalidations: true,
		argv:                 mm.argv,
		envv:                 mm.envv,
		auxv:                 append(arch.Auxv(nil), mm.auxv...),
		// IncRef'd below, once we know that there isn't an error.
		executable:         mm.executable,
		dumpability:        atomicbitops.FromInt32(mm.dumpability.Load()),
		aioManager:         aioManager{contexts: make(map[uint64]*AIOContext)},
		sleepForActivation: mm.sleepForActivation,
		vdsoSigReturnAddr:  mm.vdsoSigReturnAddr,
	}

	// Copy vmas.
	dontforks := false
	dstvgap := mm2.vmas.FirstGap()
	for srcvseg := mm.vmas.FirstSegment(); srcvseg.Ok(); srcvseg = srcvseg.NextSegment() {
		vma := srcvseg.ValuePtr().copy()
		vmaAR := srcvseg.Range()

		if vma.dontfork {
			length := uint64(vmaAR.Length())
			mm2.usageAS -= length
			if vma.isPrivateDataLocked() {
				mm2.dataAS -= length
			}
			dontforks = true
			continue
		}

		// Inform the Mappable, if any, of the new mapping.
		if vma.mappable != nil {
			if err := vma.mappable.AddMapping(ctx, mm2, vmaAR, vma.off, vma.canWriteMappableLocked()); err != nil {
				_, droppedIDs = mm2.removeVMAsLocked(ctx, mm2.applicationAddrRange(), droppedIDs)
				return nil, err
			}
		}
		if vma.id != nil {
			vma.id.IncRef()
		}
		vma.mlockMode = memmap.MLockNone
		dstvgap = mm2.vmas.Insert(dstvgap, vmaAR, vma).NextGap()
		// We don't need to update mm2.usageAS since we copied it from mm
		// above.
	}

	// Copy pmas. We have to lock mm.activeMu for writing to make existing
	// private pmas copy-on-write. We also have to lock mm2.activeMu since
	// after copying vmas above, memmap.Mappables may call mm2.Invalidate. We
	// only copy private pmas, since in the common case where fork(2) is
	// immediately followed by execve(2), copying non-private pmas that can be
	// regenerated by calling memmap.Mappable.Translate is a waste of time.
	// (Linux does the same; compare kernel/fork.c:dup_mmap() =>
	// mm/memory.c:copy_page_range().)
	mm.activeMu.Lock()
	defer mm.activeMu.Unlock()
	mm2.activeMu.NestedLock(activeLockForked)
	defer mm2.activeMu.NestedUnlock(activeLockForked)
	if dontforks {
		defer mm.pmas.MergeRange(mm.applicationAddrRange())
	}
	srcvseg := mm.vmas.FirstSegment()
	dstpgap := mm2.pmas.FirstGap()
	var unmapAR hostarch.AddrRange
	memCgID := pgalloc.MemoryCgroupIDFromContext(ctx)
	for srcpseg := mm.pmas.FirstSegment(); srcpseg.Ok(); srcpseg = srcpseg.NextSegment() {
		pma := srcpseg.ValuePtr()
		if !pma.private {
			continue
		}

		if dontforks {
			// Find the 'vma' that contains the starting address
			// associated with the 'pma' (there must be one).
			srcvseg = srcvseg.seekNextLowerBound(srcpseg.Start())
			if checkInvariants {
				if !srcvseg.Ok() {
					panic(fmt.Sprintf("no vma covers pma range %v", srcpseg.Range()))
				}
				if srcpseg.Start() < srcvseg.Start() {
					panic(fmt.Sprintf("vma %v ran ahead of pma %v", srcvseg.Range(), srcpseg.Range()))
				}
			}

			srcpseg = mm.pmas.Isolate(srcpseg, srcvseg.Range())
			if srcvseg.ValuePtr().dontfork {
				continue
			}
			pma = srcpseg.ValuePtr()
		}

		if !pma.needCOW {
			pma.needCOW = true
			if pma.effectivePerms.Write {
				// We don't want to unmap the whole address space, even though
				// doing so would reduce calls to unmapASLocked(), because mm
				// will most likely continue to be used after the fork, so
				// unmapping pmas unnecessarily will result in extra page
				// faults. But we do want to merge consecutive AddrRanges
				// across pma boundaries.
				if unmapAR.End == srcpseg.Start() {
					unmapAR.End = srcpseg.End()
				} else {
					if unmapAR.Length() != 0 {
						mm.unmapASLocked(unmapAR)
					}
					unmapAR = srcpseg.Range()
				}
				pma.effectivePerms.Write = false
			}
			pma.maxPerms.Write = false
		}
		fr := srcpseg.fileRange()
		mm2.incPrivateRef(fr)
		srcpseg.ValuePtr().file.IncRef(fr, memCgID)
		addrRange := srcpseg.Range()
		mm2.addRSSLocked(addrRange)
		dstpgap = mm2.pmas.Insert(dstpgap, addrRange, *pma).NextGap()
	}
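	// Unmap the final accumulated range (if any); its pmas were just marked
	// copy-on-write above, so stale writable mappings of it must not survive
	// in mm's AddressSpace.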
	if unmapAR.Length() != 0 {
		mm.unmapASLocked(unmapAR)
	}

	// Between when we call memmap.Mappable.AddMapping while copying vmas and
	// when we lock mm2.activeMu to copy pmas, calls to mm2.Invalidate() are
	// ineffective because the pmas they invalidate haven't yet been copied,
	// possibly allowing mm2 to get invalidated translations:
	//
	//	Invalidating Mappable            mm.Fork
	//	---------------------            -------
	//
	//	mm2.Invalidate()
	//	                                 mm.activeMu.Lock()
	//	mm.Invalidate() /* blocks */
	//	                                 mm2.activeMu.Lock()
	//	                                 (mm copies invalidated pma to mm2)
	//
	// This would technically be both safe (since we only copy private pmas,
	// which will still hold a reference on their memory) and consistent with
	// Linux, but we avoid it anyway by setting mm2.captureInvalidations during
	// construction, causing calls to mm2.Invalidate() to be captured in
	// mm2.capturedInvalidations, to be replayed after pmas are copied - i.e.
	// here.
	mm2.captureInvalidations = false
	for _, invArgs := range mm2.capturedInvalidations {
		mm2.invalidateLocked(invArgs.ar, invArgs.opts.InvalidatePrivate, true)
	}
	mm2.capturedInvalidations = nil

	if mm2.executable != nil {
		mm2.executable.IncRef()
	}
	return mm2, nil
}

// IncUsers increments mm's user count and returns true. If the user count is
// already 0, IncUsers does nothing and returns false.
func (mm *MemoryManager) IncUsers() bool {
	for {
		users := mm.users.Load()
		if users == 0 {
			return false
		}
		if mm.users.CompareAndSwap(users, users+1) {
			return true
		}
	}
}

// DecUsers decrements mm's user count. If the user count reaches 0, all
// mappings in mm are unmapped.
func (mm *MemoryManager) DecUsers(ctx context.Context) {
	if users := mm.users.Add(-1); users > 0 {
		return
	} else if users < 0 {
		panic(fmt.Sprintf("Invalid MemoryManager.users: %d", users))
	}

	mm.destroyAIOManager(ctx)

	mm.metadataMu.Lock()
	exe := mm.executable
	mm.executable = nil
	mm.metadataMu.Unlock()
	if exe != nil {
		exe.DecRef(ctx)
	}

	mm.activeMu.Lock()
	// Sanity check.
	if mm.active.Load() != 0 {
		panic("active address space lost?")
	}
	// Make sure the AddressSpace is returned.
	if mm.as != nil {
		mm.as.Release()
		mm.as = nil
	}
	mm.activeMu.Unlock()

	var droppedIDs []memmap.MappingIdentity
	mm.mappingMu.Lock()
	// If mm is being dropped before mm.SetMmapLayout was called,
	// mm.applicationAddrRange() will be empty.
	if ar := mm.applicationAddrRange(); ar.Length() != 0 {
		_, droppedIDs = mm.unmapLocked(ctx, ar, droppedIDs)
	}
	mm.mappingMu.Unlock()

	for _, id := range droppedIDs {
		id.DecRef(ctx)
	}
}
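
// shareOrCopyMM is a hypothetical helper, given only as a hedged sketch of
// how the lifecycle methods above compose: a caller that shares the address
// space (e.g. clone() with CLONE_VM) pins the existing MemoryManager with
// IncUsers, while fork() or clone() without CLONE_VM copies it with Fork.
// Either way the caller ends up holding one user reference that it must
// eventually release with DecUsers.
func shareOrCopyMM(ctx context.Context, mm *MemoryManager, shareVM bool) (*MemoryManager, error) {
	if shareVM {
		// IncUsers fails only if mm's user count has already dropped to
		// zero, i.e. mm is being torn down.
		if !mm.IncUsers() {
			return nil, fmt.Errorf("cannot share a MemoryManager with no remaining users")
		}
		return mm, nil
	}
	// The child MemoryManager is returned with 1 user.
	return mm.Fork(ctx)
}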