gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/pkg/sentry/mm/mm.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//	http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package mm provides a memory management subsystem. See README.md for a
// detailed overview.
//
// Lock order:
//
//	fs locks, except for memmap.Mappable locks
//	mm.MemoryManager.metadataMu
//	mm.MemoryManager.mappingMu
//	Locks taken by memmap.MappingIdentity and memmap.Mappable methods other
//	than Translate
//	kernel.TaskSet.mu
//	mm.MemoryManager.activeMu
//	Locks taken by memmap.Mappable.Translate
//	platform.AddressSpace locks
//	memmap.File locks
//	mm.aioManager.mu
//	mm.AIOContext.mu
//
// Only mm.MemoryManager.Fork is permitted to lock mm.MemoryManager.activeMu in
// multiple mm.MemoryManagers, as it does so in a well-defined order (forked
// child first).
package mm

import (
	"sync/atomic"

	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/atomicbitops"
	"gvisor.dev/gvisor/pkg/hostarch"
	"gvisor.dev/gvisor/pkg/safemem"
	"gvisor.dev/gvisor/pkg/sentry/arch"
	"gvisor.dev/gvisor/pkg/sentry/memmap"
	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
	"gvisor.dev/gvisor/pkg/sentry/platform"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
)

// MapsCallbackFunc has all the parameters required for populating an entry of /proc/[pid]/maps.
type MapsCallbackFunc func(start, end hostarch.Addr, permissions hostarch.AccessType, private string, offset uint64, devMajor, devMinor uint32, inode uint64, path string)
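// The sketch below is illustrative only and is not part of this package: it
// shows how a MapsCallbackFunc's parameters line up with the familiar
// /proc/[pid]/maps layout. The function name and the use of fmt are
// assumptions made for the example; the sentry's real maps formatting lives
// in the procfs code.
//
//	func exampleMapsCallback(start, end hostarch.Addr, perms hostarch.AccessType,
//		private string, offset uint64, devMajor, devMinor uint32, inode uint64, path string) {
//		// e.g. "00400000-00452000 r-xp 00000000 08:02 173521 /usr/bin/dbus-daemon"
//		fmt.Printf("%08x-%08x %s%s %08x %02x:%02x %d %s\n",
//			start, end, perms, private, offset, devMajor, devMinor, inode, path)
//	}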
// MemoryManager implements a virtual address space.
//
// +stateify savable
type MemoryManager struct {
	// p is immutable.
	p platform.Platform

	// mf is the MemoryFile used to allocate private memory.
	//
	// mf is immutable.
	mf *pgalloc.MemoryFile `state:"nosave"`

	// haveASIO is the cached result of p.SupportsAddressSpaceIO(). Aside from
	// eliminating an indirect call in the hot I/O path, this makes
	// MemoryManager.asioEnabled() a leaf function, allowing it to be inlined.
	//
	// haveASIO is immutable.
	haveASIO bool `state:"nosave"`

	// layout is the memory layout.
	//
	// layout is set by the binary loader before the MemoryManager can be used.
	layout arch.MmapLayout

	// users is the number of dependencies on the mappings in the MemoryManager.
	// When the number of references in users reaches zero, all mappings are
	// unmapped.
	users atomicbitops.Int32

	// mappingMu is analogous to Linux's struct mm_struct::mmap_sem.
	mappingMu mappingRWMutex `state:"nosave"`

	// vmas stores virtual memory areas. Since vmas are stored by value,
	// clients should usually use vmaIterator.ValuePtr() instead of
	// vmaIterator.Value() to get a pointer to the vma rather than a copy.
	//
	// Invariants: vmas are always page-aligned.
	//
	// vmas is protected by mappingMu.
	vmas vmaSet

	// brk is the mm's brk, which is manipulated using the brk(2) system call.
	// The brk is initially set up by the loader, which maps an executable
	// binary into the mm.
	//
	// brk is protected by mappingMu.
	brk hostarch.AddrRange

	// usageAS is vmas.Span(), cached to accelerate RLIMIT_AS checks.
	//
	// usageAS is protected by mappingMu.
	usageAS uint64

	// lockedAS is the combined size in bytes of all vmas with vma.mlockMode !=
	// memmap.MLockNone.
	//
	// lockedAS is protected by mappingMu.
	lockedAS uint64

	// dataAS is the combined size in bytes of all vmas that are private,
	// writable, and not stack, analogous to mm_struct->data_vm.
	//
	// dataAS is protected by mappingMu.
	dataAS uint64

	// New VMAs created by MMap use whichever of memmap.MMapOpts.MLockMode or
	// defMLockMode is greater.
	//
	// defMLockMode is protected by mappingMu.
	defMLockMode memmap.MLockMode

	// activeMu is loosely analogous to Linux's struct
	// mm_struct::page_table_lock.
	activeMu activeRWMutex `state:"nosave"`

	// pmas stores platform mapping areas used to implement vmas. Since pmas
	// are stored by value, clients should usually use pmaIterator.ValuePtr()
	// instead of pmaIterator.Value() to get a pointer to the pma rather than
	// a copy.
	//
	// Inserting or removing segments from pmas should happen along with a
	// call to mm.insertRSS or mm.removeRSS.
	//
	// Invariants: pmas are always page-aligned. If a pma exists for a given
	// address, a vma must also exist for that address.
	//
	// pmas is protected by activeMu.
	pmas pmaSet

	// curRSS is pmas.Span(), cached to accelerate updates to maxRSS. It is
	// reported as the MemoryManager's RSS.
	//
	// curRSS should be modified only via insertRSS and removeRSS, not
	// directly.
	//
	// curRSS is protected by activeMu.
	curRSS uint64

	// maxRSS is the maximum resident set size in bytes of a MemoryManager.
	// It is tracked as the application adds and removes mappings to pmas.
	//
	// maxRSS should be modified only via insertRSS, not directly.
	//
	// maxRSS is protected by activeMu.
	maxRSS uint64

	// as is the platform.AddressSpace that pmas are mapped into. active is the
	// number of contexts that require as to be non-nil; if active == 0, as may
	// be nil.
	//
	// as is protected by activeMu. active is manipulated with atomic memory
	// operations; transitions to and from zero are additionally protected by
	// activeMu. (This is because such transitions may need to be atomic with
	// changes to as.)
	as     platform.AddressSpace `state:"nosave"`
	active atomicbitops.Int32    `state:"zerovalue"`

	// unmapAllOnActivate indicates that the next Activate call should activate
	// an empty AddressSpace.
	//
	// This is used to ensure that an AddressSpace cached in NewAddressSpace is
	// not used after some change in the MemoryManager or VMAs has made that
	// AddressSpace stale.
	//
	// unmapAllOnActivate is protected by activeMu. It must only be set when
	// there is no active or cached AddressSpace. If as != nil, then
	// invalidations should be propagated immediately.
	unmapAllOnActivate bool `state:"nosave"`

	// If captureInvalidations is true, calls to MM.Invalidate() are recorded
	// in capturedInvalidations rather than being applied immediately to pmas.
	// This is to avoid a race condition in MM.Fork(); see that function for
	// details.
	//
	// Both captureInvalidations and capturedInvalidations are protected by
	// activeMu. Neither needs to be saved, since captureInvalidations is only
	// enabled during MM.Fork(), during which saving can't occur.
	captureInvalidations  bool              `state:"zerovalue"`
	capturedInvalidations []invalidateArgs  `state:"nosave"`

	// dumpability describes if and how this MemoryManager may be dumped to
	// userspace. This is read under kernel.TaskSet.mu, so it can't be
	// protected by metadataMu.
	dumpability atomicbitops.Int32

	metadataMu metadataMutex `state:"nosave"`

	// argv is the application argv. This is set up by the loader and may be
	// modified by prctl(PR_SET_MM_ARG_START/PR_SET_MM_ARG_END). No
	// requirements apply to argv; we do not require that argv.WellFormed().
	//
	// argv is protected by metadataMu.
	argv hostarch.AddrRange

	// envv is the application envv. This is set up by the loader and may be
	// modified by prctl(PR_SET_MM_ENV_START/PR_SET_MM_ENV_END). No
	// requirements apply to envv; we do not require that envv.WellFormed().
	//
	// envv is protected by metadataMu.
	envv hostarch.AddrRange

	// auxv is the ELF's auxiliary vector.
	//
	// auxv is protected by metadataMu.
	auxv arch.Auxv

	// executable is the executable for this MemoryManager. If executable is
	// not nil, the MemoryManager holds a reference on it.
	//
	// executable is protected by metadataMu.
	executable *vfs.FileDescription

	// aioManager keeps track of AIOContexts used for async IOs. aioManager
	// must be cloned when CLONE_VM is used.
	aioManager aioManager

	// sleepForActivation indicates whether the task should report itself as
	// sleeping before trying to activate the address space. When set to true,
	// delays in activation are not reported as stuck tasks by the watchdog.
	sleepForActivation bool

	// vdsoSigReturnAddr is the address of 'vdso_sigreturn'.
	vdsoSigReturnAddr uint64

	// membarrierPrivateEnabled is non-zero if EnableMembarrierPrivate has
	// previously been called. Since, as of this writing,
	// MEMBARRIER_CMD_PRIVATE_EXPEDITED is implemented as a global memory
	// barrier, membarrierPrivateEnabled has no other effect.
	membarrierPrivateEnabled atomicbitops.Uint32

	// membarrierRSeqEnabled is non-zero if EnableMembarrierRSeq has previously
	// been called.
	membarrierRSeqEnabled atomicbitops.Uint32
}
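// exampleLockOrdering is an illustrative sketch only; it is hypothetical and
// not part of the real mm package. It demonstrates the package-level lock
// order documented at the top of this file: mappingMu (protecting vmas) is
// always acquired before activeMu (protecting pmas).
func (mm *MemoryManager) exampleLockOrdering() {
	mm.mappingMu.RLock()
	defer mm.mappingMu.RUnlock()
	// With mappingMu held, taking activeMu afterwards respects the documented
	// order, so vmas and pmas can be examined together consistently (for
	// example, to check that every pma is covered by a vma).
	mm.activeMu.RLock()
	defer mm.activeMu.RUnlock()
	_ = mm.vmas.Span()
	_ = mm.pmas.Span()
}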
// vma represents a virtual memory area.
//
// Note: new fields added to this struct must be added to vma.copy and
// vmaSetFunctions.Merge.
//
// +stateify savable
type vma struct {
	// mappable is the virtual memory object mapped by this vma. If mappable is
	// nil, the vma represents an anonymous mapping.
	mappable memmap.Mappable

	// off is the offset into mappable at which this vma begins. If mappable is
	// nil, off is meaningless.
	off uint64

	// To speed up VMA save/restore, we group and save the following booleans
	// as a single integer.

	// realPerms are the memory permissions on this vma, as defined by the
	// application.
	realPerms hostarch.AccessType `state:".(int)"`

	// effectivePerms are the memory permissions on this vma which are
	// actually used to control access.
	//
	// Invariant: effectivePerms == realPerms.Effective().
	effectivePerms hostarch.AccessType `state:"manual"`

	// maxPerms limits the set of permissions that may ever apply to this
	// memory, as well as accesses for which usermem.IOOpts.IgnorePermissions
	// is true (e.g. ptrace(PTRACE_POKEDATA)).
	//
	// Invariant: maxPerms == maxPerms.Effective().
	maxPerms hostarch.AccessType `state:"manual"`

	// private is true if this is a MAP_PRIVATE mapping, such that writes to
	// the mapping are propagated to a copy.
	private bool `state:"manual"`

	// growsDown is true if the mapping may be automatically extended downward
	// under certain conditions. If growsDown is true, mappable must be nil.
	//
	// There is currently no corresponding growsUp flag; in Linux, the only
	// architectures that can have VM_GROWSUP mappings are ia64, parisc, and
	// metag, none of which we currently support.
	growsDown bool `state:"manual"`

	// dontfork is the MADV_DONTFORK setting for this vma, configured by
	// madvise().
	dontfork bool

	mlockMode memmap.MLockMode

	// numaPolicy is the NUMA policy for this vma set by mbind().
	numaPolicy linux.NumaPolicy

	// numaNodemask is the NUMA nodemask for this vma set by mbind().
	numaNodemask uint64

	// If id is not nil, it controls the lifecycle of mappable and provides vma
	// metadata shown in /proc/[pid]/maps, and the vma holds a reference.
	id memmap.MappingIdentity

	// If hint is non-empty, it is a description of the vma printed in
	// /proc/[pid]/maps. hint takes priority over id.MappedName().
	hint string

	// lastFault records the last address that was page faulted. It hints at
	// which direction addresses in this vma are being accessed.
	//
	// This field can be read atomically, and written with mm.activeMu locked
	// for writing and mm.mappingMu locked.
	lastFault uintptr
}

func (v *vma) copy() vma {
	return vma{
		mappable:       v.mappable,
		off:            v.off,
		realPerms:      v.realPerms,
		effectivePerms: v.effectivePerms,
		maxPerms:       v.maxPerms,
		private:        v.private,
		growsDown:      v.growsDown,
		dontfork:       v.dontfork,
		mlockMode:      v.mlockMode,
		numaPolicy:     v.numaPolicy,
		numaNodemask:   v.numaNodemask,
		id:             v.id,
		hint:           v.hint,
		lastFault:      atomic.LoadUintptr(&v.lastFault),
	}
}
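// The invariant effectivePerms == realPerms.Effective() can be pictured with
// a short sketch. The values shown are an example only, assuming that
// hostarch.AccessType.Effective follows Linux's protection_map behavior in
// which write or execute access implies read access:
//
//	real := hostarch.AccessType{Write: true} // mmap(..., PROT_WRITE, ...)
//	eff := real.Effective()                  // {Read: true, Write: true}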
// pma represents a platform mapping area.
//
// +stateify savable
type pma struct {
	// file is the file mapped by this pma. Only pmas for which file is of
	// type pgalloc.MemoryFile may be saved. pmas hold a reference to the
	// corresponding file range while they exist.
	file memmap.File `state:".(string)"`

	// off is the offset into file at which this pma begins.
	off uint64

	// translatePerms is the permissions returned by memmap.Mappable.Translate.
	// If private is true, translatePerms is hostarch.AnyAccess.
	translatePerms hostarch.AccessType

	// effectivePerms is the permissions allowed for non-ignorePermissions
	// accesses. maxPerms is the permissions allowed for ignorePermissions
	// accesses. These are vma.effectivePerms and vma.maxPerms respectively,
	// masked by pma.translatePerms and with Write disallowed if pma.needCOW
	// is true.
	//
	// These are stored in the pma so that the IO implementation can avoid
	// iterating mm.vmas when pmas already exist.
	effectivePerms hostarch.AccessType
	maxPerms       hostarch.AccessType

	// needCOW is true if writes to the mapping must be propagated to a copy.
	needCOW bool

	// private is true if this pma represents private memory.
	//
	// If private is true, file must be MemoryManager.mf, and calls to
	// Invalidate for which memmap.InvalidateOpts.InvalidatePrivate is false
	// should ignore the pma.
	//
	// If private is false, this pma caches a translation from the
	// corresponding vma's memmap.Mappable.Translate.
	private bool

	// If internalMappings is not empty, it is the cached return value of
	// file.MapInternal for the memmap.FileRange mapped by this pma.
	internalMappings safemem.BlockSeq `state:"nosave"`
}
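// The derivation of the cached pma permissions described above can be
// sketched as follows. This is illustrative only, not the package's actual
// pma construction code, and it assumes hostarch.AccessType.Intersect is a
// field-wise AND of the two permission sets:
//
//	effectivePerms := vma.effectivePerms.Intersect(translatePerms)
//	maxPerms := vma.maxPerms.Intersect(translatePerms)
//	if needCOW {
//		// Copy-on-write pending: the pma must not be writable until the
//		// copy has been made.
//		effectivePerms.Write = false
//		maxPerms.Write = false
//	}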
type invalidateArgs struct {
	ar   hostarch.AddrRange
	opts memmap.InvalidateOpts
}
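// How invalidateArgs is used can be sketched as follows. This is a
// paraphrased, illustrative sketch only; the real MM.Invalidate and MM.Fork
// implementations live in other files of this package and differ in detail.
// While captureInvalidations is set during MM.Fork, invalidations are queued
// rather than applied to pmas, and are replayed once the fork completes:
//
//	func (mm *MemoryManager) Invalidate(ar hostarch.AddrRange, opts memmap.InvalidateOpts) {
//		mm.activeMu.Lock()
//		defer mm.activeMu.Unlock()
//		if mm.captureInvalidations {
//			mm.capturedInvalidations = append(mm.capturedInvalidations, invalidateArgs{ar, opts})
//			return
//		}
//		// ... otherwise apply the invalidation to pmas immediately ...
//	}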