github.com/nicocha30/gvisor-ligolo@v0.0.0-20230726075806-989fa2c0a413/pkg/sentry/mm/mm.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//	http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package mm provides a memory management subsystem. See README.md for a
// detailed overview.
//
// Lock order:
//
//	fs locks, except for memmap.Mappable locks
//		mm.MemoryManager.metadataMu
//			mm.MemoryManager.mappingMu
//				Locks taken by memmap.MappingIdentity and memmap.Mappable methods other
//				than Translate
//					kernel.TaskSet.mu
//						mm.MemoryManager.activeMu
//							Locks taken by memmap.Mappable.Translate
//								mm.privateRefs.mu
//									platform.AddressSpace locks
//										memmap.File locks
//				mm.aioManager.mu
//					mm.AIOContext.mu
//
// Only mm.MemoryManager.Fork is permitted to lock mm.MemoryManager.activeMu in
// multiple mm.MemoryManagers, as it does so in a well-defined order (forked
// child first).
package mm

import (
	"sync/atomic"

	"github.com/nicocha30/gvisor-ligolo/pkg/abi/linux"
	"github.com/nicocha30/gvisor-ligolo/pkg/atomicbitops"
	"github.com/nicocha30/gvisor-ligolo/pkg/hostarch"
	"github.com/nicocha30/gvisor-ligolo/pkg/safemem"
	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/arch"
	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/memmap"
	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/pgalloc"
	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/platform"
	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/vfs"
)

// MapsCallbackFunc has all the parameters required for populating an entry of /proc/[pid]/maps.
type MapsCallbackFunc func(start, end hostarch.Addr, permissions hostarch.AccessType, private string, offset uint64, devMajor, devMinor uint32, inode uint64, path string)
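
// The sketch below shows one possible MapsCallbackFunc that renders an entry
// in the usual /proc/[pid]/maps layout. It is illustrative only: the sentry's
// real formatting lives in the proc filesystem implementation, and the use of
// fmt.Sprintf and AccessType.String here is an assumption for demonstration,
// not this package's API.
//
//	var entries []string
//	cb := MapsCallbackFunc(func(start, end hostarch.Addr,
//		permissions hostarch.AccessType, private string, offset uint64,
//		devMajor, devMinor uint32, inode uint64, path string) {
//		entries = append(entries, fmt.Sprintf(
//			"%08x-%08x %s%s %08x %02x:%02x %d %s",
//			start, end, permissions.String(), private, offset,
//			devMajor, devMinor, inode, path))
//	})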

// MemoryManager implements a virtual address space.
//
// +stateify savable
type MemoryManager struct {
	// p and mfp are immutable.
	p   platform.Platform
	mfp pgalloc.MemoryFileProvider

	// haveASIO is the cached result of p.SupportsAddressSpaceIO(). Aside from
	// eliminating an indirect call in the hot I/O path, this makes
	// MemoryManager.asioEnabled() a leaf function, allowing it to be inlined.
	//
	// haveASIO is immutable.
	haveASIO bool `state:"nosave"`

	// layout is the memory layout.
	//
	// layout is set by the binary loader before the MemoryManager can be used.
	layout arch.MmapLayout

	// privateRefs stores reference counts for private memory (memory whose
	// ownership is shared by one or more pmas instead of being owned by a
	// memmap.Mappable).
	//
	// privateRefs is immutable.
	privateRefs *privateRefs

	// users is the number of dependencies on the mappings in the MemoryManager.
	// When the number of references in users reaches zero, all mappings are
	// unmapped.
	users atomicbitops.Int32

	// mappingMu is analogous to Linux's struct mm_struct::mmap_sem.
	mappingMu mappingRWMutex `state:"nosave"`

	// vmas stores virtual memory areas. Since vmas are stored by value,
	// clients should usually use vmaIterator.ValuePtr() instead of
	// vmaIterator.Value() to get a pointer to the vma rather than a copy.
	//
	// Invariants: vmas are always page-aligned.
	//
	// vmas is protected by mappingMu.
	vmas vmaSet

	// brk is the mm's brk, which is manipulated using the brk(2) system call.
	// The brk is initially set up by the loader which maps an executable
	// binary into the mm.
	//
	// brk is protected by mappingMu.
	brk hostarch.AddrRange

	// usageAS is vmas.Span(), cached to accelerate RLIMIT_AS checks.
	//
	// usageAS is protected by mappingMu.
	usageAS uint64

	// lockedAS is the combined size in bytes of all vmas with vma.mlockMode !=
	// memmap.MLockNone.
	//
	// lockedAS is protected by mappingMu.
	lockedAS uint64

	// dataAS is the size of private data segments, like mm_struct->data_vm.
	// It counts vmas that are private, writable, and not stack.
	//
	// dataAS is protected by mappingMu.
	dataAS uint64

	// New VMAs created by MMap use whichever of memmap.MMapOpts.MLockMode or
	// defMLockMode is greater.
	//
	// defMLockMode is protected by mappingMu.
	defMLockMode memmap.MLockMode

	// activeMu is loosely analogous to Linux's struct
	// mm_struct::page_table_lock.
	activeMu activeRWMutex `state:"nosave"`

	// pmas stores platform mapping areas used to implement vmas. Since pmas
	// are stored by value, clients should usually use pmaIterator.ValuePtr()
	// instead of pmaIterator.Value() to get a pointer to the pma rather than
	// a copy.
	//
	// Inserting or removing segments from pmas should happen along with a
	// call to mm.insertRSS or mm.removeRSS.
	//
	// Invariants: pmas are always page-aligned. If a pma exists for a given
	// address, a vma must also exist for that address.
	//
	// pmas is protected by activeMu.
	pmas pmaSet

	// curRSS is pmas.Span(), cached to accelerate updates to maxRSS. It is
	// reported as the MemoryManager's RSS.
	//
	// curRSS should be modified only via insertRSS and removeRSS, not
	// directly.
	//
	// curRSS is protected by activeMu.
	curRSS uint64

	// maxRSS is the maximum resident set size in bytes of a MemoryManager.
	// It is tracked as the application adds and removes mappings to pmas.
	//
	// maxRSS should be modified only via insertRSS, not directly.
	//
	// maxRSS is protected by activeMu.
	maxRSS uint64

	// as is the platform.AddressSpace that pmas are mapped into. active is the
	// number of contexts that require as to be non-nil; if active == 0, as may
	// be nil.
	//
	// as is protected by activeMu. active is manipulated with atomic memory
	// operations; transitions to and from zero are additionally protected by
	// activeMu. (This is because such transitions may need to be atomic with
	// changes to as.)
	as     platform.AddressSpace `state:"nosave"`
	active atomicbitops.Int32    `state:"zerovalue"`

	// unmapAllOnActivate indicates that the next Activate call should activate
	// an empty AddressSpace.
	//
	// This is used to ensure that an AddressSpace cached in
	// NewAddressSpace is not used after some change in the MemoryManager
	// or VMAs has made that AddressSpace stale.
	//
	// unmapAllOnActivate is protected by activeMu. It must only be set when
	// there is no active or cached AddressSpace. If as != nil, then
	// invalidations should be propagated immediately.
	unmapAllOnActivate bool `state:"nosave"`

	// If captureInvalidations is true, calls to MM.Invalidate() are recorded
	// in capturedInvalidations rather than being applied immediately to pmas.
	// This is to avoid a race condition in MM.Fork(); see that function for
	// details.
	//
	// Both captureInvalidations and capturedInvalidations are protected by
	// activeMu. Neither need to be saved since captureInvalidations is only
	// enabled during MM.Fork(), during which saving can't occur.
	captureInvalidations  bool             `state:"zerovalue"`
	capturedInvalidations []invalidateArgs `state:"nosave"`

	// dumpability describes if and how this MemoryManager may be dumped to
	// userspace. This is read under kernel.TaskSet.mu, so it can't be
	// protected by metadataMu.
	dumpability atomicbitops.Int32

	metadataMu metadataMutex `state:"nosave"`

	// argv is the application argv. This is set up by the loader and may be
	// modified by prctl(PR_SET_MM_ARG_START/PR_SET_MM_ARG_END). No
	// requirements apply to argv; we do not require that argv.WellFormed().
	//
	// argv is protected by metadataMu.
	argv hostarch.AddrRange

	// envv is the application envv. This is set up by the loader and may be
	// modified by prctl(PR_SET_MM_ENV_START/PR_SET_MM_ENV_END). No
	// requirements apply to envv; we do not require that envv.WellFormed().
	//
	// envv is protected by metadataMu.
	envv hostarch.AddrRange

	// auxv is the ELF's auxiliary vector.
	//
	// auxv is protected by metadataMu.
	auxv arch.Auxv

	// executable is the executable for this MemoryManager. If executable
	// is not nil, it holds a reference on the FileDescription.
	//
	// executable is protected by metadataMu.
	executable *vfs.FileDescription

	// aioManager keeps track of AIOContexts used for async IOs. AIOManager
	// must be cloned when CLONE_VM is used.
	aioManager aioManager

	// sleepForActivation indicates whether the task should report itself as
	// sleeping before trying to activate the address space. When set to true,
	// delays in activation are not reported as stuck tasks by the watchdog.
	sleepForActivation bool

	// vdsoSigReturnAddr is the address of 'vdso_sigreturn'.
	vdsoSigReturnAddr uint64

	// membarrierPrivateEnabled is non-zero if EnableMembarrierPrivate has
	// previously been called. Since, as of this writing,
	// MEMBARRIER_CMD_PRIVATE_EXPEDITED is implemented as a global memory
	// barrier, membarrierPrivateEnabled has no other effect.
	membarrierPrivateEnabled atomicbitops.Uint32

	// membarrierRSeqEnabled is non-zero if EnableMembarrierRSeq has previously
	// been called.
	membarrierRSeqEnabled atomicbitops.Uint32
}
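
// The curRSS and maxRSS comments above prescribe updates via insertRSS and
// removeRSS. A minimal sketch of that pattern, assuming the helpers take the
// inserted or removed address range (the real implementations live elsewhere
// in this package, are called with activeMu held, and also maintain the pmas
// set):
//
//	func (mm *MemoryManager) insertRSS(ar hostarch.AddrRange) {
//		mm.curRSS += uint64(ar.Length())
//		if mm.curRSS > mm.maxRSS {
//			mm.maxRSS = mm.curRSS
//		}
//	}
//
//	func (mm *MemoryManager) removeRSS(ar hostarch.AddrRange) {
//		mm.curRSS -= uint64(ar.Length())
//	}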

// vma represents a virtual memory area.
//
// Note: new fields added to this struct must be added to vma.Copy and
// vmaSetFunctions.Merge.
//
// +stateify savable
type vma struct {
	// mappable is the virtual memory object mapped by this vma. If mappable is
	// nil, the vma represents an anonymous mapping.
	mappable memmap.Mappable

	// off is the offset into mappable at which this vma begins. If mappable is
	// nil, off is meaningless.
	off uint64

	// To speed up VMA save/restore, we group and save the following booleans
	// as a single integer.

	// realPerms are the memory permissions on this vma, as defined by the
	// application.
	realPerms hostarch.AccessType `state:".(int)"`

	// effectivePerms are the memory permissions on this vma which are
	// actually used to control access.
	//
	// Invariant: effectivePerms == realPerms.Effective().
	effectivePerms hostarch.AccessType `state:"manual"`

	// maxPerms limits the set of permissions that may ever apply to this
	// memory, as well as accesses for which usermem.IOOpts.IgnorePermissions
	// is true (e.g. ptrace(PTRACE_POKEDATA)).
	//
	// Invariant: maxPerms == maxPerms.Effective().
	maxPerms hostarch.AccessType `state:"manual"`

	// private is true if this is a MAP_PRIVATE mapping, such that writes to
	// the mapping are propagated to a copy.
	private bool `state:"manual"`

	// growsDown is true if the mapping may be automatically extended downward
	// under certain conditions. If growsDown is true, mappable must be nil.
	//
	// There is currently no corresponding growsUp flag; in Linux, the only
	// architectures that can have VM_GROWSUP mappings are ia64, parisc, and
	// metag, none of which we currently support.
	growsDown bool `state:"manual"`

	// dontfork is the MADV_DONTFORK setting for this vma configured by madvise().
	dontfork bool

	mlockMode memmap.MLockMode

	// numaPolicy is the NUMA policy for this vma set by mbind().
	numaPolicy linux.NumaPolicy

	// numaNodemask is the NUMA nodemask for this vma set by mbind().
	numaNodemask uint64

	// If id is not nil, it controls the lifecycle of mappable and provides vma
	// metadata shown in /proc/[pid]/maps, and the vma holds a reference.
	id memmap.MappingIdentity

	// If hint is non-empty, it is a description of the vma printed in
	// /proc/[pid]/maps. hint takes priority over id.MappedName().
	hint string

	// lastFault records the last address that was page faulted. It hints at
	// the direction in which addresses in this vma are being accessed.
	//
	// This field can be read atomically, and written with mm.activeMu locked
	// for writing and mm.mappingMu locked.
	lastFault uintptr
}

const (
	vmaRealPermsRead = 1 << iota
	vmaRealPermsWrite
	vmaRealPermsExecute
	vmaEffectivePermsRead
	vmaEffectivePermsWrite
	vmaEffectivePermsExecute
	vmaMaxPermsRead
	vmaMaxPermsWrite
	vmaMaxPermsExecute
	vmaPrivate
	vmaGrowsDown
)

// saveRealPerms packs realPerms, effectivePerms, maxPerms, private, and
// growsDown into a single integer for state save, per the `state:".(int)"`
// tag on realPerms.
func (v *vma) saveRealPerms() int {
	var b int
	if v.realPerms.Read {
		b |= vmaRealPermsRead
	}
	if v.realPerms.Write {
		b |= vmaRealPermsWrite
	}
	if v.realPerms.Execute {
		b |= vmaRealPermsExecute
	}
	if v.effectivePerms.Read {
		b |= vmaEffectivePermsRead
	}
	if v.effectivePerms.Write {
		b |= vmaEffectivePermsWrite
	}
	if v.effectivePerms.Execute {
		b |= vmaEffectivePermsExecute
	}
	if v.maxPerms.Read {
		b |= vmaMaxPermsRead
	}
	if v.maxPerms.Write {
		b |= vmaMaxPermsWrite
	}
	if v.maxPerms.Execute {
		b |= vmaMaxPermsExecute
	}
	if v.private {
		b |= vmaPrivate
	}
	if v.growsDown {
		b |= vmaGrowsDown
	}
	return b
}

// loadRealPerms is the inverse of saveRealPerms: it unpacks the saved integer
// back into the permission and flag fields on state load.
func (v *vma) loadRealPerms(b int) {
	if b&vmaRealPermsRead > 0 {
		v.realPerms.Read = true
	}
	if b&vmaRealPermsWrite > 0 {
		v.realPerms.Write = true
	}
	if b&vmaRealPermsExecute > 0 {
		v.realPerms.Execute = true
	}
	if b&vmaEffectivePermsRead > 0 {
		v.effectivePerms.Read = true
	}
	if b&vmaEffectivePermsWrite > 0 {
		v.effectivePerms.Write = true
	}
	if b&vmaEffectivePermsExecute > 0 {
		v.effectivePerms.Execute = true
	}
	if b&vmaMaxPermsRead > 0 {
		v.maxPerms.Read = true
	}
	if b&vmaMaxPermsWrite > 0 {
		v.maxPerms.Write = true
	}
	if b&vmaMaxPermsExecute > 0 {
		v.maxPerms.Execute = true
	}
	if b&vmaPrivate > 0 {
		v.private = true
	}
	if b&vmaGrowsDown > 0 {
		v.growsDown = true
	}
}

// copy returns a field-by-field copy of the vma, reading lastFault
// atomically.
func (v *vma) copy() vma {
	return vma{
		mappable:       v.mappable,
		off:            v.off,
		realPerms:      v.realPerms,
		effectivePerms: v.effectivePerms,
		maxPerms:       v.maxPerms,
		private:        v.private,
		growsDown:      v.growsDown,
		dontfork:       v.dontfork,
		mlockMode:      v.mlockMode,
		numaPolicy:     v.numaPolicy,
		numaNodemask:   v.numaNodemask,
		id:             v.id,
		hint:           v.hint,
		lastFault:      atomic.LoadUintptr(&v.lastFault),
	}
}
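
// A round-trip sketch of the save/load pair above (illustrative; the names
// src and dst are hypothetical):
//
//	src := vma{
//		realPerms: hostarch.AccessType{Read: true, Execute: true},
//		private:   true,
//	}
//	bits := src.saveRealPerms() // packs all five flag groups into one int
//	var dst vma
//	dst.loadRealPerms(bits) // dst.realPerms == src.realPerms, dst.private == true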

// pma represents a platform mapping area.
//
// +stateify savable
type pma struct {
	// file is the file mapped by this pma. Only pmas for which file ==
	// MemoryManager.mfp.MemoryFile() may be saved. pmas hold a reference to
	// the corresponding file range while they exist.
	file memmap.File `state:"nosave"`

	// off is the offset into file at which this pma begins.
	//
	// Note that pmas do *not* hold references on offsets in file! If private
	// is true, MemoryManager.privateRefs holds the reference instead. If
	// private is false, the corresponding memmap.Mappable holds the reference
	// instead (per memmap.Mappable.Translate requirement).
	off uint64

	// translatePerms is the permissions returned by memmap.Mappable.Translate.
	// If private is true, translatePerms is hostarch.AnyAccess.
	translatePerms hostarch.AccessType

	// effectivePerms is the permissions allowed for non-ignorePermissions
	// accesses. maxPerms is the permissions allowed for ignorePermissions
	// accesses. These are vma.effectivePerms and vma.maxPerms respectively,
	// masked by pma.translatePerms and with Write disallowed if pma.needCOW is
	// true.
	//
	// These are stored in the pma so that the IO implementation can avoid
	// iterating mm.vmas when pmas already exist.
	effectivePerms hostarch.AccessType
	maxPerms       hostarch.AccessType

	// needCOW is true if writes to the mapping must be propagated to a copy.
	needCOW bool

	// private is true if this pma represents private memory.
	//
	// If private is true, file must be MemoryManager.mfp.MemoryFile(), the pma
	// holds a reference on the mapped memory that is tracked in privateRefs,
	// and calls to Invalidate for which
	// memmap.InvalidateOpts.InvalidatePrivate is false should ignore the pma.
	//
	// If private is false, this pma caches a translation from the
	// corresponding vma's memmap.Mappable.Translate.
	private bool

	// If internalMappings is not empty, it is the cached return value of
	// file.MapInternal for the memmap.FileRange mapped by this pma.
	internalMappings safemem.BlockSeq `state:"nosave"`
}

// +stateify savable
type privateRefs struct {
	mu privateRefsMutex `state:"nosave"`

	// refs maps offsets into MemoryManager.mfp.MemoryFile() to the number of
	// pmas (or, equivalently, MemoryManagers) that share ownership of the
	// memory at that offset.
	refs fileRefcountSet
}

type invalidateArgs struct {
	ar   hostarch.AddrRange
	opts memmap.InvalidateOpts
}

// fileRefcountSetFunctions implements segment.Functions for fileRefcountSet.
type fileRefcountSetFunctions struct{}

func (fileRefcountSetFunctions) MinKey() uint64 {
	return 0
}

func (fileRefcountSetFunctions) MaxKey() uint64 {
	return ^uint64(0)
}

func (fileRefcountSetFunctions) ClearValue(_ *int32) {
}

// Merge allows two adjacent segments to merge only if they share the same
// reference count.
func (fileRefcountSetFunctions) Merge(_ memmap.FileRange, rc1 int32, _ memmap.FileRange, rc2 int32) (int32, bool) {
	return rc1, rc1 == rc2
}

// Split gives both halves of a split segment the original reference count.
func (fileRefcountSetFunctions) Split(_ memmap.FileRange, rc int32, _ uint64) (int32, int32) {
	return rc, rc
}
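
// Illustrative semantics of Merge and Split above (hypothetical values; a
// memmap.FileRange is a [Start, End) byte range):
//
//	f := fileRefcountSetFunctions{}
//	rc, ok := f.Merge(memmap.FileRange{Start: 0, End: 4096}, 2,
//		memmap.FileRange{Start: 4096, End: 8192}, 2)
//	// rc == 2, ok == true: adjacent ranges with equal refcounts merge.
//	_, ok = f.Merge(memmap.FileRange{Start: 0, End: 4096}, 1,
//		memmap.FileRange{Start: 4096, End: 8192}, 2)
//	// ok == false: unequal refcounts prevent merging.
//	left, right := f.Split(memmap.FileRange{Start: 0, End: 8192}, 3, 4096)
//	// left == 3 && right == 3: both halves keep the original refcount.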