// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package mm provides a memory management subsystem. See README.md for a
// detailed overview.
//
// Lock order:
//
//	fs locks, except for memmap.Mappable locks
//	mm.MemoryManager.metadataMu
//	mm.MemoryManager.mappingMu
//	Locks taken by memmap.Mappable methods other than Translate
//	mm.MemoryManager.activeMu
//	Locks taken by memmap.Mappable.Translate
//	mm.privateRefs.mu
//	platform.AddressSpace locks
//	memmap.File locks
//	mm.aioManager.mu
//	mm.AIOContext.mu
//
// Only mm.MemoryManager.Fork is permitted to lock mm.MemoryManager.activeMu in
// multiple mm.MemoryManagers, as it does so in a well-defined order (forked
// child first).
package mm

import (
	"github.com/SagerNet/gvisor/pkg/abi/linux"
	"github.com/SagerNet/gvisor/pkg/hostarch"
	"github.com/SagerNet/gvisor/pkg/safemem"
	"github.com/SagerNet/gvisor/pkg/sentry/arch"
	"github.com/SagerNet/gvisor/pkg/sentry/fsbridge"
	"github.com/SagerNet/gvisor/pkg/sentry/memmap"
	"github.com/SagerNet/gvisor/pkg/sentry/pgalloc"
	"github.com/SagerNet/gvisor/pkg/sentry/platform"
	"github.com/SagerNet/gvisor/pkg/sync"
)

// MemoryManager implements a virtual address space.
50 // 51 // +stateify savable 52 type MemoryManager struct { 53 // p and mfp are immutable. 54 p platform.Platform 55 mfp pgalloc.MemoryFileProvider 56 57 // haveASIO is the cached result of p.SupportsAddressSpaceIO(). Aside from 58 // eliminating an indirect call in the hot I/O path, this makes 59 // MemoryManager.asioEnabled() a leaf function, allowing it to be inlined. 60 // 61 // haveASIO is immutable. 62 haveASIO bool `state:"nosave"` 63 64 // layout is the memory layout. 65 // 66 // layout is set by the binary loader before the MemoryManager can be used. 67 layout arch.MmapLayout 68 69 // privateRefs stores reference counts for private memory (memory whose 70 // ownership is shared by one or more pmas instead of being owned by a 71 // memmap.Mappable). 72 // 73 // privateRefs is immutable. 74 privateRefs *privateRefs 75 76 // users is the number of dependencies on the mappings in the MemoryManager. 77 // When the number of references in users reaches zero, all mappings are 78 // unmapped. 79 // 80 // users is accessed using atomic memory operations. 81 users int32 82 83 // mappingMu is analogous to Linux's struct mm_struct::mmap_sem. 84 mappingMu sync.RWMutex `state:"nosave"` 85 86 // vmas stores virtual memory areas. Since vmas are stored by value, 87 // clients should usually use vmaIterator.ValuePtr() instead of 88 // vmaIterator.Value() to get a pointer to the vma rather than a copy. 89 // 90 // Invariants: vmas are always page-aligned. 91 // 92 // vmas is protected by mappingMu. 93 vmas vmaSet 94 95 // brk is the mm's brk, which is manipulated using the brk(2) system call. 96 // The brk is initially set up by the loader which maps an executable 97 // binary into the mm. 98 // 99 // brk is protected by mappingMu. 100 brk hostarch.AddrRange 101 102 // usageAS is vmas.Span(), cached to accelerate RLIMIT_AS checks. 103 // 104 // usageAS is protected by mappingMu. 
105 usageAS uint64 106 107 // lockedAS is the combined size in bytes of all vmas with vma.mlockMode != 108 // memmap.MLockNone. 109 // 110 // lockedAS is protected by mappingMu. 111 lockedAS uint64 112 113 // dataAS is the size of private data segments, like mm_struct->data_vm. 114 // It means the vma which is private, writable, not stack. 115 // 116 // dataAS is protected by mappingMu. 117 dataAS uint64 118 119 // New VMAs created by MMap use whichever of memmap.MMapOpts.MLockMode or 120 // defMLockMode is greater. 121 // 122 // defMLockMode is protected by mappingMu. 123 defMLockMode memmap.MLockMode 124 125 // activeMu is loosely analogous to Linux's struct 126 // mm_struct::page_table_lock. 127 activeMu sync.RWMutex `state:"nosave"` 128 129 // pmas stores platform mapping areas used to implement vmas. Since pmas 130 // are stored by value, clients should usually use pmaIterator.ValuePtr() 131 // instead of pmaIterator.Value() to get a pointer to the pma rather than 132 // a copy. 133 // 134 // Inserting or removing segments from pmas should happen along with a 135 // call to mm.insertRSS or mm.removeRSS. 136 // 137 // Invariants: pmas are always page-aligned. If a pma exists for a given 138 // address, a vma must also exist for that address. 139 // 140 // pmas is protected by activeMu. 141 pmas pmaSet 142 143 // curRSS is pmas.Span(), cached to accelerate updates to maxRSS. It is 144 // reported as the MemoryManager's RSS. 145 // 146 // maxRSS should be modified only via insertRSS and removeRSS, not 147 // directly. 148 // 149 // maxRSS is protected by activeMu. 150 curRSS uint64 151 152 // maxRSS is the maximum resident set size in bytes of a MemoryManager. 153 // It is tracked as the application adds and removes mappings to pmas. 154 // 155 // maxRSS should be modified only via insertRSS, not directly. 156 // 157 // maxRSS is protected by activeMu. 158 maxRSS uint64 159 160 // as is the platform.AddressSpace that pmas are mapped into. 
active is the 161 // number of contexts that require as to be non-nil; if active == 0, as may 162 // be nil. 163 // 164 // as is protected by activeMu. active is manipulated with atomic memory 165 // operations; transitions to and from zero are additionally protected by 166 // activeMu. (This is because such transitions may need to be atomic with 167 // changes to as.) 168 as platform.AddressSpace `state:"nosave"` 169 active int32 `state:"zerovalue"` 170 171 // unmapAllOnActivate indicates that the next Activate call should activate 172 // an empty AddressSpace. 173 // 174 // This is used to ensure that an AddressSpace cached in 175 // NewAddressSpace is not used after some change in the MemoryManager 176 // or VMAs has made that AddressSpace stale. 177 // 178 // unmapAllOnActivate is protected by activeMu. It must only be set when 179 // there is no active or cached AddressSpace. If as != nil, then 180 // invalidations should be propagated immediately. 181 unmapAllOnActivate bool `state:"nosave"` 182 183 // If captureInvalidations is true, calls to MM.Invalidate() are recorded 184 // in capturedInvalidations rather than being applied immediately to pmas. 185 // This is to avoid a race condition in MM.Fork(); see that function for 186 // details. 187 // 188 // Both captureInvalidations and capturedInvalidations are protected by 189 // activeMu. Neither need to be saved since captureInvalidations is only 190 // enabled during MM.Fork(), during which saving can't occur. 191 captureInvalidations bool `state:"zerovalue"` 192 capturedInvalidations []invalidateArgs `state:"nosave"` 193 194 metadataMu sync.Mutex `state:"nosave"` 195 196 // argv is the application argv. This is set up by the loader and may be 197 // modified by prctl(PR_SET_MM_ARG_START/PR_SET_MM_ARG_END). No 198 // requirements apply to argv; we do not require that argv.WellFormed(). 199 // 200 // argv is protected by metadataMu. 201 argv hostarch.AddrRange 202 203 // envv is the application envv. 
This is set up by the loader and may be 204 // modified by prctl(PR_SET_MM_ENV_START/PR_SET_MM_ENV_END). No 205 // requirements apply to envv; we do not require that envv.WellFormed(). 206 // 207 // envv is protected by metadataMu. 208 envv hostarch.AddrRange 209 210 // auxv is the ELF's auxiliary vector. 211 // 212 // auxv is protected by metadataMu. 213 auxv arch.Auxv 214 215 // executable is the executable for this MemoryManager. If executable 216 // is not nil, it holds a reference on the Dirent. 217 // 218 // executable is protected by metadataMu. 219 executable fsbridge.File 220 221 // dumpability describes if and how this MemoryManager may be dumped to 222 // userspace. 223 // 224 // dumpability is protected by metadataMu. 225 dumpability Dumpability 226 227 // aioManager keeps track of AIOContexts used for async IOs. AIOManager 228 // must be cloned when CLONE_VM is used. 229 aioManager aioManager 230 231 // sleepForActivation indicates whether the task should report to be sleeping 232 // before trying to activate the address space. When set to true, delays in 233 // activation are not reported as stuck tasks by the watchdog. 234 sleepForActivation bool 235 236 // vdsoSigReturnAddr is the address of 'vdso_sigreturn'. 237 vdsoSigReturnAddr uint64 238 239 // membarrierPrivateEnabled is non-zero if EnableMembarrierPrivate has 240 // previously been called. Since, as of this writing, 241 // MEMBARRIER_CMD_PRIVATE_EXPEDITED is implemented as a global memory 242 // barrier, membarrierPrivateEnabled has no other effect. 243 // 244 // membarrierPrivateEnabled is accessed using atomic memory operations. 245 membarrierPrivateEnabled uint32 246 247 // membarrierRSeqEnabled is non-zero if EnableMembarrierRSeq has previously 248 // been called. 249 // 250 // membarrierRSeqEnabled is accessed using atomic memory operations. 251 membarrierRSeqEnabled uint32 252 } 253 254 // vma represents a virtual memory area. 
255 // 256 // +stateify savable 257 type vma struct { 258 // mappable is the virtual memory object mapped by this vma. If mappable is 259 // nil, the vma represents an anonymous mapping. 260 mappable memmap.Mappable 261 262 // off is the offset into mappable at which this vma begins. If mappable is 263 // nil, off is meaningless. 264 off uint64 265 266 // To speedup VMA save/restore, we group and save the following booleans 267 // as a single integer. 268 269 // realPerms are the memory permissions on this vma, as defined by the 270 // application. 271 realPerms hostarch.AccessType `state:".(int)"` 272 273 // effectivePerms are the memory permissions on this vma which are 274 // actually used to control access. 275 // 276 // Invariant: effectivePerms == realPerms.Effective(). 277 effectivePerms hostarch.AccessType `state:"manual"` 278 279 // maxPerms limits the set of permissions that may ever apply to this 280 // memory, as well as accesses for which usermem.IOOpts.IgnorePermissions 281 // is true (e.g. ptrace(PTRACE_POKEDATA)). 282 // 283 // Invariant: maxPerms == maxPerms.Effective(). 284 maxPerms hostarch.AccessType `state:"manual"` 285 286 // private is true if this is a MAP_PRIVATE mapping, such that writes to 287 // the mapping are propagated to a copy. 288 private bool `state:"manual"` 289 290 // growsDown is true if the mapping may be automatically extended downward 291 // under certain conditions. If growsDown is true, mappable must be nil. 292 // 293 // There is currently no corresponding growsUp flag; in Linux, the only 294 // architectures that can have VM_GROWSUP mappings are ia64, parisc, and 295 // metag, none of which we currently support. 296 growsDown bool `state:"manual"` 297 298 // dontfork is the MADV_DONTFORK setting for this vma configured by madvise(). 299 dontfork bool 300 301 mlockMode memmap.MLockMode 302 303 // numaPolicy is the NUMA policy for this vma set by mbind(). 
304 numaPolicy linux.NumaPolicy 305 306 // numaNodemask is the NUMA nodemask for this vma set by mbind(). 307 numaNodemask uint64 308 309 // If id is not nil, it controls the lifecycle of mappable and provides vma 310 // metadata shown in /proc/[pid]/maps, and the vma holds a reference. 311 id memmap.MappingIdentity 312 313 // If hint is non-empty, it is a description of the vma printed in 314 // /proc/[pid]/maps. hint takes priority over id.MappedName(). 315 hint string 316 } 317 318 const ( 319 vmaRealPermsRead = 1 << iota 320 vmaRealPermsWrite 321 vmaRealPermsExecute 322 vmaEffectivePermsRead 323 vmaEffectivePermsWrite 324 vmaEffectivePermsExecute 325 vmaMaxPermsRead 326 vmaMaxPermsWrite 327 vmaMaxPermsExecute 328 vmaPrivate 329 vmaGrowsDown 330 ) 331 332 func (v *vma) saveRealPerms() int { 333 var b int 334 if v.realPerms.Read { 335 b |= vmaRealPermsRead 336 } 337 if v.realPerms.Write { 338 b |= vmaRealPermsWrite 339 } 340 if v.realPerms.Execute { 341 b |= vmaRealPermsExecute 342 } 343 if v.effectivePerms.Read { 344 b |= vmaEffectivePermsRead 345 } 346 if v.effectivePerms.Write { 347 b |= vmaEffectivePermsWrite 348 } 349 if v.effectivePerms.Execute { 350 b |= vmaEffectivePermsExecute 351 } 352 if v.maxPerms.Read { 353 b |= vmaMaxPermsRead 354 } 355 if v.maxPerms.Write { 356 b |= vmaMaxPermsWrite 357 } 358 if v.maxPerms.Execute { 359 b |= vmaMaxPermsExecute 360 } 361 if v.private { 362 b |= vmaPrivate 363 } 364 if v.growsDown { 365 b |= vmaGrowsDown 366 } 367 return b 368 } 369 370 func (v *vma) loadRealPerms(b int) { 371 if b&vmaRealPermsRead > 0 { 372 v.realPerms.Read = true 373 } 374 if b&vmaRealPermsWrite > 0 { 375 v.realPerms.Write = true 376 } 377 if b&vmaRealPermsExecute > 0 { 378 v.realPerms.Execute = true 379 } 380 if b&vmaEffectivePermsRead > 0 { 381 v.effectivePerms.Read = true 382 } 383 if b&vmaEffectivePermsWrite > 0 { 384 v.effectivePerms.Write = true 385 } 386 if b&vmaEffectivePermsExecute > 0 { 387 v.effectivePerms.Execute = true 388 } 389 if 
b&vmaMaxPermsRead > 0 { 390 v.maxPerms.Read = true 391 } 392 if b&vmaMaxPermsWrite > 0 { 393 v.maxPerms.Write = true 394 } 395 if b&vmaMaxPermsExecute > 0 { 396 v.maxPerms.Execute = true 397 } 398 if b&vmaPrivate > 0 { 399 v.private = true 400 } 401 if b&vmaGrowsDown > 0 { 402 v.growsDown = true 403 } 404 } 405 406 // pma represents a platform mapping area. 407 // 408 // +stateify savable 409 type pma struct { 410 // file is the file mapped by this pma. Only pmas for which file == 411 // MemoryManager.mfp.MemoryFile() may be saved. pmas hold a reference to 412 // the corresponding file range while they exist. 413 file memmap.File `state:"nosave"` 414 415 // off is the offset into file at which this pma begins. 416 // 417 // Note that pmas do *not* hold references on offsets in file! If private 418 // is true, MemoryManager.privateRefs holds the reference instead. If 419 // private is false, the corresponding memmap.Mappable holds the reference 420 // instead (per memmap.Mappable.Translate requirement). 421 off uint64 422 423 // translatePerms is the permissions returned by memmap.Mappable.Translate. 424 // If private is true, translatePerms is hostarch.AnyAccess. 425 translatePerms hostarch.AccessType 426 427 // effectivePerms is the permissions allowed for non-ignorePermissions 428 // accesses. maxPerms is the permissions allowed for ignorePermissions 429 // accesses. These are vma.effectivePerms and vma.maxPerms respectively, 430 // masked by pma.translatePerms and with Write disallowed if pma.needCOW is 431 // true. 432 // 433 // These are stored in the pma so that the IO implementation can avoid 434 // iterating mm.vmas when pmas already exist. 435 effectivePerms hostarch.AccessType 436 maxPerms hostarch.AccessType 437 438 // needCOW is true if writes to the mapping must be propagated to a copy. 439 needCOW bool 440 441 // private is true if this pma represents private memory. 
442 // 443 // If private is true, file must be MemoryManager.mfp.MemoryFile(), the pma 444 // holds a reference on the mapped memory that is tracked in privateRefs, 445 // and calls to Invalidate for which 446 // memmap.InvalidateOpts.InvalidatePrivate is false should ignore the pma. 447 // 448 // If private is false, this pma caches a translation from the 449 // corresponding vma's memmap.Mappable.Translate. 450 private bool 451 452 // If internalMappings is not empty, it is the cached return value of 453 // file.MapInternal for the memmap.FileRange mapped by this pma. 454 internalMappings safemem.BlockSeq `state:"nosave"` 455 } 456 457 // +stateify savable 458 type privateRefs struct { 459 mu sync.Mutex `state:"nosave"` 460 461 // refs maps offsets into MemoryManager.mfp.MemoryFile() to the number of 462 // pmas (or, equivalently, MemoryManagers) that share ownership of the 463 // memory at that offset. 464 refs fileRefcountSet 465 } 466 467 type invalidateArgs struct { 468 ar hostarch.AddrRange 469 opts memmap.InvalidateOpts 470 } 471 472 // fileRefcountSetFunctions implements segment.Functions for fileRefcountSet. 473 type fileRefcountSetFunctions struct{} 474 475 func (fileRefcountSetFunctions) MinKey() uint64 { 476 return 0 477 } 478 479 func (fileRefcountSetFunctions) MaxKey() uint64 { 480 return ^uint64(0) 481 } 482 483 func (fileRefcountSetFunctions) ClearValue(_ *int32) { 484 } 485 486 func (fileRefcountSetFunctions) Merge(_ memmap.FileRange, rc1 int32, _ memmap.FileRange, rc2 int32) (int32, bool) { 487 return rc1, rc1 == rc2 488 } 489 490 func (fileRefcountSetFunctions) Split(_ memmap.FileRange, rc int32, _ uint64) (int32, int32) { 491 return rc, rc 492 }