gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/pkg/sentry/mm/pma.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package mm

import (
	"fmt"
	"sync"
	"sync/atomic"

	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/hostarch"
	"gvisor.dev/gvisor/pkg/safecopy"
	"gvisor.dev/gvisor/pkg/safemem"
	"gvisor.dev/gvisor/pkg/sentry/memmap"
	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
	"gvisor.dev/gvisor/pkg/sentry/usage"
)

// existingPMAsLocked checks that pmas exist for all addresses in ar, and
// support access of type (at, ignorePermissions). If so, it returns an
// iterator to the pma containing ar.Start. Otherwise it returns a terminal
// iterator.
//
// Preconditions:
//   - mm.activeMu must be locked.
//   - ar.Length() != 0.
func (mm *MemoryManager) existingPMAsLocked(ar hostarch.AddrRange, at hostarch.AccessType, ignorePermissions bool, needInternalMappings bool) pmaIterator {
	if checkInvariants {
		if !ar.WellFormed() || ar.Length() == 0 {
			panic(fmt.Sprintf("invalid ar: %v", ar))
		}
	}

	first := mm.pmas.FindSegment(ar.Start)
	pseg := first
	for pseg.Ok() {
		pma := pseg.ValuePtr()
		perms := pma.effectivePerms
		if ignorePermissions {
			perms = pma.maxPerms
		}
		if !perms.SupersetOf(at) {
			return pmaIterator{}
		}
		if needInternalMappings && pma.internalMappings.IsEmpty() {
			return pmaIterator{}
		}

		if ar.End <= pseg.End() {
			return first
		}
		pseg, _ = pseg.NextNonEmpty()
	}

	// Ran out of pmas before reaching ar.End.
	return pmaIterator{}
}

// existingVecPMAsLocked returns true if pmas exist for all addresses in ars,
// and support access of type (at, ignorePermissions).
//
// Preconditions: mm.activeMu must be locked.
func (mm *MemoryManager) existingVecPMAsLocked(ars hostarch.AddrRangeSeq, at hostarch.AccessType, ignorePermissions bool, needInternalMappings bool) bool {
	for ; !ars.IsEmpty(); ars = ars.Tail() {
		if ar := ars.Head(); ar.Length() != 0 && !mm.existingPMAsLocked(ar, at, ignorePermissions, needInternalMappings).Ok() {
			return false
		}
	}
	return true
}
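
// Editorial sketch, not part of the original file: it illustrates a typical
// fast-path use of existingPMAsLocked, in which a caller checks for
// already-established pmas under a read lock before falling back to
// getPMAsLocked under the write lock. The helper name and the choice of a
// read lock here are assumptions for illustration only.
func (mm *MemoryManager) hasUsablePMAsSketch(ar hostarch.AddrRange, at hostarch.AccessType) bool {
	if ar.Length() == 0 {
		return false // existingPMAsLocked requires ar.Length() != 0
	}
	mm.activeMu.RLock()
	// Don't require cached internal mappings; only check that pmas exist with
	// sufficient effective permissions.
	pseg := mm.existingPMAsLocked(ar, at, false /* ignorePermissions */, false /* needInternalMappings */)
	mm.activeMu.RUnlock()
	return pseg.Ok()
}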

// getPMAsLocked ensures that pmas exist for all addresses in ar, and support
// access of type at. It returns:
//
//   - An iterator to the pma containing ar.Start. If no pma contains ar.Start,
//     the iterator is unspecified.
//
//   - An iterator to the gap after the last pma containing an address in ar.
//     If pmas exist for no addresses in ar, the iterator is to a gap that
//     begins before ar.Start.
//
//   - An error that is non-nil if pmas exist for only a subset of ar.
//
// Preconditions:
//   - mm.mappingMu must be locked.
//   - mm.activeMu must be locked for writing.
//   - ar.Length() != 0.
//   - vseg.Range().Contains(ar.Start).
//   - vmas must exist for all addresses in ar, and support accesses of type at
//     (i.e. permission checks must have been performed against vmas).
func (mm *MemoryManager) getPMAsLocked(ctx context.Context, vseg vmaIterator, ar hostarch.AddrRange, at hostarch.AccessType) (pmaIterator, pmaGapIterator, error) {
	if checkInvariants {
		if !ar.WellFormed() || ar.Length() == 0 {
			panic(fmt.Sprintf("invalid ar: %v", ar))
		}
		if !vseg.Ok() {
			panic("terminal vma iterator")
		}
		if !vseg.Range().Contains(ar.Start) {
			panic(fmt.Sprintf("initial vma %v does not cover start of ar %v", vseg.Range(), ar))
		}
	}

	// Page-align ar so that all AddrRanges are aligned.
	end, ok := ar.End.RoundUp()
	var alignerr error
	if !ok {
		end = ar.End.RoundDown()
		alignerr = linuxerr.EFAULT
	}
	ar = hostarch.AddrRange{ar.Start.RoundDown(), end}

	pstart, pend, perr := mm.getPMAsInternalLocked(ctx, vseg, ar, at)
	if pend.Start() <= ar.Start {
		return pmaIterator{}, pend, perr
	}
	// getPMAsInternalLocked may not have returned pstart due to iterator
	// invalidation.
	if !pstart.Ok() {
		pstart = mm.findOrSeekPrevUpperBoundPMA(ar.Start, pend)
	}
	if perr != nil {
		return pstart, pend, perr
	}
	return pstart, pend, alignerr
}

// getVecPMAsLocked ensures that pmas exist for all addresses in ars, and
// support access of type at. It returns the subset of ars for which pmas
// exist. If this is not equal to ars, it returns a non-nil error explaining
// why.
//
// Preconditions:
//   - mm.mappingMu must be locked.
//   - mm.activeMu must be locked for writing.
//   - vmas must exist for all addresses in ars, and support accesses of type
//     at (i.e. permission checks must have been performed against vmas).
func (mm *MemoryManager) getVecPMAsLocked(ctx context.Context, ars hostarch.AddrRangeSeq, at hostarch.AccessType) (hostarch.AddrRangeSeq, error) {
	for arsit := ars; !arsit.IsEmpty(); arsit = arsit.Tail() {
		ar := arsit.Head()
		if ar.Length() == 0 {
			continue
		}
		if checkInvariants {
			if !ar.WellFormed() {
				panic(fmt.Sprintf("invalid ar: %v", ar))
			}
		}

		// Page-align ar so that all AddrRanges are aligned.
		end, ok := ar.End.RoundUp()
		var alignerr error
		if !ok {
			end = ar.End.RoundDown()
			alignerr = linuxerr.EFAULT
		}
		ar = hostarch.AddrRange{ar.Start.RoundDown(), end}

		_, pend, perr := mm.getPMAsInternalLocked(ctx, mm.vmas.FindSegment(ar.Start), ar, at)
		if perr != nil {
			return truncatedAddrRangeSeq(ars, arsit, pend.Start()), perr
		}
		if alignerr != nil {
			return truncatedAddrRangeSeq(ars, arsit, pend.Start()), alignerr
		}
	}

	return ars, nil
}
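
// Editorial sketch, not part of the original file: getPMAsLocked and
// getVecPMAsLocked both begin by page-aligning the requested range, and only
// report linuxerr.EFAULT if rounding the end up would overflow the address
// space. The helper below mirrors that step in isolation; its name is
// hypothetical.
func pageAlignSketch(ar hostarch.AddrRange) (hostarch.AddrRange, bool) {
	end, ok := ar.End.RoundUp()
	if !ok {
		// Rounding up overflowed; fall back to the largest aligned end and let
		// the caller surface EFAULT after making what progress it can.
		end = ar.End.RoundDown()
	}
	return hostarch.AddrRange{ar.Start.RoundDown(), end}, ok
}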

// getPMAsInternalLocked is equivalent to getPMAsLocked, with the following
// exceptions:
//
//   - getPMAsInternalLocked returns a pmaIterator on a best-effort basis (that
//     is, the returned iterator may be terminal, even if a pma that contains
//     ar.Start exists). Returning this iterator on a best-effort basis allows
//     callers that require it to use it when it's cheaply available, while
//     also avoiding the overhead of retrieving it when it's not.
//
//   - getPMAsInternalLocked additionally requires that ar is page-aligned.
//
// getPMAsInternalLocked is an implementation helper for getPMAsLocked and
// getVecPMAsLocked; other clients should call one of those instead.
func (mm *MemoryManager) getPMAsInternalLocked(ctx context.Context, vseg vmaIterator, ar hostarch.AddrRange, at hostarch.AccessType) (pmaIterator, pmaGapIterator, error) {
	if checkInvariants {
		if !ar.WellFormed() || ar.Length() == 0 || !ar.IsPageAligned() {
			panic(fmt.Sprintf("invalid ar: %v", ar))
		}
		if !vseg.Ok() {
			panic("terminal vma iterator")
		}
		if !vseg.Range().Contains(ar.Start) {
			panic(fmt.Sprintf("initial vma %v does not cover start of ar %v", vseg.Range(), ar))
		}
	}
	var pfdrs *pendingFileDecRefs
	defer func() { // must be a closure to avoid evaluating pfdrs immediately
		pfdrs.Cleanup()
	}()
	var unmapAR hostarch.AddrRange
	defer func() {
		mm.unmapASLocked(unmapAR)
	}()

	memCgID := pgalloc.MemoryCgroupIDFromContext(ctx)
	opts := pgalloc.AllocOpts{Kind: usage.Anonymous, Dir: pgalloc.BottomUp, MemCgID: memCgID}
	vma := vseg.ValuePtr()
	if uintptr(ar.Start) < atomic.LoadUintptr(&vma.lastFault) {
		// Detect cases where memory is accessed downwards and change memory file
		// allocation order to increase the chances that pages are coalesced.
		opts.Dir = pgalloc.TopDown
	}
	atomic.StoreUintptr(&vma.lastFault, uintptr(ar.Start))

	// Limit the range we allocate to ar, aligned to privateAllocUnit.
	maskAR := privateAligned(ar)
	// The range in which we iterate vmas and pmas is still limited to ar, to
	// ensure that we don't allocate or COW-break a pma we don't need.
	pseg, pgap := mm.pmas.Find(ar.Start)
	pstart := pseg
	for {
		// Get pmas for this vma.
		vsegAR := vseg.Range().Intersect(ar)
		vma := vseg.ValuePtr()
	pmaLoop:
		for {
			switch {
			case pgap.Ok() && pgap.Start() < vsegAR.End:
				// Need a pma here.
				optAR := vseg.Range().Intersect(pgap.Range())
				if checkInvariants {
					if optAR.Length() == 0 {
						panic(fmt.Sprintf("vseg %v and pgap %v do not overlap", vseg, pgap))
					}
				}
				if vma.mappable == nil {
					// Private anonymous mappings get pmas by allocating.
					allocAR := optAR.Intersect(maskAR)
					fr, err := mm.mf.Allocate(uint64(allocAR.Length()), opts)
					if err != nil {
						return pstart, pgap, err
					}
					if checkInvariants {
						if !fr.WellFormed() || fr.Length() != uint64(allocAR.Length()) {
							panic(fmt.Sprintf("Allocate(%v) returned invalid FileRange %v", allocAR.Length(), fr))
						}
					}
					mm.addRSSLocked(allocAR)
					pseg, pgap = mm.pmas.Insert(pgap, allocAR, pma{
						file:           mm.mf,
						off:            fr.Start,
						translatePerms: hostarch.AnyAccess,
						effectivePerms: vma.effectivePerms,
						maxPerms:       vma.maxPerms,
						// Since we just allocated this memory and have the
						// only reference, the new pma does not need
						// copy-on-write.
						private: true,
					}).NextNonEmpty()
					pstart = pmaIterator{} // iterators invalidated
				} else {
					// Other mappings get pmas by translating.
					optMR := vseg.mappableRangeOf(optAR)
					reqAR := optAR.Intersect(ar)
					reqMR := vseg.mappableRangeOf(reqAR)
					perms := at
					if vma.private {
						// This pma will be copy-on-write; don't require write
						// permission, but do require read permission to
						// facilitate the copy.
						//
						// If at.Write is true, we will need to break
						// copy-on-write immediately, which occurs after
						// translation below.
						perms.Read = true
						perms.Write = false
					}
					ts, err := vma.mappable.Translate(ctx, reqMR, optMR, perms)
					if checkInvariants {
						if err := memmap.CheckTranslateResult(reqMR, optMR, perms, ts, err); err != nil {
							panic(fmt.Sprintf("Mappable(%T).Translate(%v, %v, %v): %v", vma.mappable, reqMR, optMR, perms, err))
						}
					}
					// Install a pma for each translation.
					if len(ts) == 0 {
						return pstart, pgap, err
					}
					pstart = pmaIterator{} // iterators invalidated
					for _, t := range ts {
						newpmaAR := vseg.addrRangeOf(t.Source)
						newpma := pma{
							file:           t.File,
							off:            t.Offset,
							translatePerms: t.Perms,
							effectivePerms: vma.effectivePerms.Intersect(t.Perms),
							maxPerms:       vma.maxPerms.Intersect(t.Perms),
						}
						if vma.private {
							newpma.effectivePerms.Write = false
							newpma.maxPerms.Write = false
							newpma.needCOW = true
						}
						mm.addRSSLocked(newpmaAR)
						t.File.IncRef(t.FileRange(), memCgID)
						// This is valid because memmap.Mappable.Translate is
						// required to return Translations in increasing
						// Translation.Source order.
						pseg = mm.pmas.Insert(pgap, newpmaAR, newpma)
						pgap = pseg.NextGap()
					}
					// The error returned by Translate is only significant if
					// it occurred before ar.End.
					if err != nil && vseg.addrRangeOf(ts[len(ts)-1].Source).End < ar.End {
						return pstart, pgap, err
					}
					// Rewind pseg to the first pma inserted and continue the
					// loop to check if we need to break copy-on-write.
					pseg, pgap = mm.findOrSeekPrevUpperBoundPMA(vseg.addrRangeOf(ts[0].Source).Start, pgap), pmaGapIterator{}
					continue
				}

			case pseg.Ok() && pseg.Start() < vsegAR.End:
				oldpma := pseg.ValuePtr()
				if at.Write && mm.isPMACopyOnWriteLocked(vseg, pseg) {
					// Break copy-on-write by copying.
					if checkInvariants {
						if !oldpma.maxPerms.Read {
							panic(fmt.Sprintf("pma %v needs to be copied for writing, but is not readable: %v", pseg.Range(), oldpma))
						}
					}
					var copyAR hostarch.AddrRange
					if vma := vseg.ValuePtr(); vma.effectivePerms.Execute {
						// The majority of copy-on-write breaks on executable
						// pages come from:
						//
						// - The ELF loader, which must zero out bytes on the
						//   last page of each segment after the end of the
						//   segment.
						//
						// - gdb's use of ptrace to insert breakpoints.
						//
						// Neither of these cases has enough spatial locality
						// to benefit from copying nearby pages, so if the vma
						// is executable, only copy the pages required.
						copyAR = pseg.Range().Intersect(ar)
					} else if vma.growsDown {
						// In most cases, the new process will not use most of
						// its stack before exiting or invoking execve(); it is
						// especially unlikely to return very far down its call
						// stack, since async-signal-safety concerns in
						// multithreaded programs prevent the new process from
						// being able to do much. So only copy up to one page
						// before and after the pages required.
						stackMaskAR := ar
						if newStart := stackMaskAR.Start - hostarch.PageSize; newStart < stackMaskAR.Start {
							stackMaskAR.Start = newStart
						}
						if newEnd := stackMaskAR.End + hostarch.PageSize; newEnd > stackMaskAR.End {
							stackMaskAR.End = newEnd
						}
						copyAR = pseg.Range().Intersect(stackMaskAR)
					} else {
						copyAR = pseg.Range().Intersect(maskAR)
					}
					// Get internal mappings from the pma to copy from.
					if err := pseg.getInternalMappingsLocked(); err != nil {
						return pstart, pseg.PrevGap(), err
					}
					// Copy contents.
					reader := safemem.BlockSeqReader{Blocks: mm.internalMappingsLocked(pseg, copyAR)}
					fr, err := mm.mf.Allocate(uint64(copyAR.Length()), pgalloc.AllocOpts{
						Kind:       usage.Anonymous,
						Mode:       pgalloc.AllocateAndWritePopulate,
						MemCgID:    memCgID,
						ReaderFunc: reader.ReadToBlocks,
					})
					if _, ok := err.(safecopy.BusError); ok {
						// If we got SIGBUS during the copy, deliver SIGBUS to
						// userspace (instead of SIGSEGV) if we're breaking
						// copy-on-write due to application page fault.
						err = &memmap.BusError{err}
					}
					if fr.Length() == 0 {
						return pstart, pseg.PrevGap(), err
					}
					// Replace the pma with a copy in the part of the address
					// range where copying was successful. This doesn't change
					// RSS.
					copyAR.End = copyAR.Start + hostarch.Addr(fr.Length())
					if copyAR != pseg.Range() {
						pseg = mm.pmas.Isolate(pseg, copyAR)
						pstart = pmaIterator{} // iterators invalidated
					}
					oldpma = pseg.ValuePtr()
					unmapAR = joinAddrRanges(unmapAR, copyAR)
					pfdrs = appendPendingFileDecRef(pfdrs, oldpma.file, pseg.fileRange())
					oldpma.file = mm.mf
					oldpma.off = fr.Start
					oldpma.translatePerms = hostarch.AnyAccess
					oldpma.effectivePerms = vma.effectivePerms
					oldpma.maxPerms = vma.maxPerms
					oldpma.needCOW = false
					oldpma.private = true
					oldpma.internalMappings = safemem.BlockSeq{}
					// Try to merge the pma with its neighbors.
					if prev := pseg.PrevSegment(); prev.Ok() {
						if merged := mm.pmas.Merge(prev, pseg); merged.Ok() {
							pseg = merged
							pstart = pmaIterator{} // iterators invalidated
						}
					}
					if next := pseg.NextSegment(); next.Ok() {
						if merged := mm.pmas.Merge(pseg, next); merged.Ok() {
							pseg = merged
							pstart = pmaIterator{} // iterators invalidated
						}
					}
					// The error returned by Allocate is only significant if it
					// occurred before ar.End.
					if err != nil && pseg.End() < ar.End {
						return pstart, pseg.NextGap(), err
					}
					// Ensure pseg and pgap are correct for the next iteration
					// of the loop.
					pseg, pgap = pseg.NextNonEmpty()
				} else if !oldpma.translatePerms.SupersetOf(at) {
					// Get new pmas (with sufficient permissions) by calling
					// memmap.Mappable.Translate again.
					if checkInvariants {
						if oldpma.private {
							panic(fmt.Sprintf("private pma %v has non-maximal pma.translatePerms: %v", pseg.Range(), oldpma))
						}
					}
					// Allow the entire pma to be replaced.
					optAR := pseg.Range()
					optMR := vseg.mappableRangeOf(optAR)
					reqAR := optAR.Intersect(ar)
					reqMR := vseg.mappableRangeOf(reqAR)
					perms := oldpma.translatePerms.Union(at)
					ts, err := vma.mappable.Translate(ctx, reqMR, optMR, perms)
					if checkInvariants {
						if err := memmap.CheckTranslateResult(reqMR, optMR, perms, ts, err); err != nil {
							panic(fmt.Sprintf("Mappable(%T).Translate(%v, %v, %v): %v", vma.mappable, reqMR, optMR, perms, err))
						}
					}
					// Remove the part of the existing pma covered by new
					// Translations, then insert new pmas. This doesn't change
					// RSS.
					if len(ts) == 0 {
						return pstart, pseg.PrevGap(), err
					}
					transMR := memmap.MappableRange{ts[0].Source.Start, ts[len(ts)-1].Source.End}
					transAR := vseg.addrRangeOf(transMR)
					pseg = mm.pmas.Isolate(pseg, transAR)
					unmapAR = joinAddrRanges(unmapAR, transAR)
					pfdrs = appendPendingFileDecRef(pfdrs, pseg.ValuePtr().file, pseg.fileRange())
					pgap = mm.pmas.Remove(pseg)
					pstart = pmaIterator{} // iterators invalidated
					for _, t := range ts {
						newpmaAR := vseg.addrRangeOf(t.Source)
						newpma := pma{
							file:           t.File,
							off:            t.Offset,
							translatePerms: t.Perms,
							effectivePerms: vma.effectivePerms.Intersect(t.Perms),
							maxPerms:       vma.maxPerms.Intersect(t.Perms),
						}
						if vma.private {
							newpma.effectivePerms.Write = false
							newpma.maxPerms.Write = false
							newpma.needCOW = true
						}
						t.File.IncRef(t.FileRange(), memCgID)
						pseg = mm.pmas.Insert(pgap, newpmaAR, newpma)
						pgap = pseg.NextGap()
					}
					// The error returned by Translate is only significant if
					// it occurred before ar.End.
					if err != nil && pseg.End() < ar.End {
						return pstart, pgap, err
					}
					// Ensure pseg and pgap are correct for the next iteration
					// of the loop.
					if pgap.Range().Length() == 0 {
						pseg, pgap = pgap.NextSegment(), pmaGapIterator{}
					} else {
						pseg = pmaIterator{}
					}
				} else {
					// We have a usable pma; continue.
					pseg, pgap = pseg.NextNonEmpty()
				}

			default:
				break pmaLoop
			}
		}
		// Go to the next vma.
		if ar.End <= vseg.End() {
			if pgap.Ok() {
				return pstart, pgap, nil
			}
			return pstart, pseg.PrevGap(), nil
		}
		vseg = vseg.NextSegment()
	}
}

const (
	// When memory is allocated for a private pma, align the allocated address
	// range to a privateAllocUnit boundary when possible. Larger values of
	// privateAllocUnit may reduce page faults by allowing fewer, larger pmas
	// to be mapped, but may result in larger amounts of wasted memory in the
	// presence of fragmentation. privateAllocUnit must be a power-of-2
	// multiple of hostarch.PageSize.
	privateAllocUnit = hostarch.HugePageSize

	privateAllocMask = privateAllocUnit - 1
)

func privateAligned(ar hostarch.AddrRange) hostarch.AddrRange {
	aligned := hostarch.AddrRange{ar.Start &^ privateAllocMask, ar.End}
	if end := (ar.End + privateAllocMask) &^ privateAllocMask; end >= ar.End {
		aligned.End = end
	}
	if checkInvariants {
		if !aligned.IsSupersetOf(ar) {
			panic(fmt.Sprintf("aligned AddrRange %#v is not a superset of ar %#v", aligned, ar))
		}
	}
	return aligned
}
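
// Editorial sketch, not part of the original file: a concrete example of
// privateAligned. On platforms where hostarch.HugePageSize is 2MB, a faulting
// range of [0x201000, 0x202000) expands to [0x200000, 0x400000); callers then
// intersect this mask with the vma and the surrounding pma gap, so the
// expansion never allocates outside the mapping.
func privateAlignedExampleSketch() hostarch.AddrRange {
	ar := hostarch.AddrRange{0x201000, 0x202000}
	return privateAligned(ar)
}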

// isPMACopyOnWriteLocked returns true if the contents of the pma represented
// by pseg must be copied to a new private pma to be written to.
//
// If the pma is a copy-on-write private pma, and holds the only reference on
// the memory it maps, isPMACopyOnWriteLocked will take ownership of the memory
// and update the pma to indicate that it does not require copy-on-write.
//
// Preconditions:
//   - vseg.Range().IsSupersetOf(pseg.Range()).
//   - mm.mappingMu must be locked.
//   - mm.activeMu must be locked for writing.
func (mm *MemoryManager) isPMACopyOnWriteLocked(vseg vmaIterator, pseg pmaIterator) bool {
	pma := pseg.ValuePtr()
	if !pma.needCOW {
		return false
	}
	if !pma.private {
		return true
	}
	// If we have the only reference on private memory to be copied, just take
	// ownership of it instead of copying. If we do hold the only reference,
	// additional references can only be taken by mm.Fork(), which is excluded
	// by mm.activeMu, so this isn't racy.
	if mm.mf.HasUniqueRef(pseg.fileRange()) {
		pma.needCOW = false
		// pma.private => pma.translatePerms == hostarch.AnyAccess
		vma := vseg.ValuePtr()
		pma.effectivePerms = vma.effectivePerms
		pma.maxPerms = vma.maxPerms
		return false
	}
	return true
}

// Invalidate implements memmap.MappingSpace.Invalidate.
func (mm *MemoryManager) Invalidate(ar hostarch.AddrRange, opts memmap.InvalidateOpts) {
	if checkInvariants {
		if !ar.WellFormed() || ar.Length() == 0 || !ar.IsPageAligned() {
			panic(fmt.Sprintf("invalid ar: %v", ar))
		}
	}

	mm.activeMu.Lock()
	defer mm.activeMu.Unlock()
	if mm.captureInvalidations {
		mm.capturedInvalidations = append(mm.capturedInvalidations, invalidateArgs{ar, opts})
		return
	}
	mm.invalidateLocked(ar, opts.InvalidatePrivate, true)
}

// invalidateLocked removes pmas and AddressSpace mappings of those pmas for
// addresses in ar.
//
// Preconditions:
//   - mm.activeMu must be locked for writing.
//   - ar.Length() != 0.
//   - ar must be page-aligned.
func (mm *MemoryManager) invalidateLocked(ar hostarch.AddrRange, invalidatePrivate, invalidateShared bool) {
	if checkInvariants {
		if !ar.WellFormed() || ar.Length() == 0 || !ar.IsPageAligned() {
			panic(fmt.Sprintf("invalid ar: %v", ar))
		}
	}

	var didUnmapAS bool
	pseg := mm.pmas.LowerBoundSegment(ar.Start)
	for pseg.Ok() && pseg.Start() < ar.End {
		pma := pseg.ValuePtr()
		if (invalidatePrivate && pma.private) || (invalidateShared && !pma.private) {
			pseg = mm.pmas.Isolate(pseg, ar)
			pma = pseg.ValuePtr()
			if !didUnmapAS {
				// Unmap all of ar, not just pseg.Range(), to minimize host
				// syscalls. AddressSpace mappings must be removed before
				// pma.file.DecRef().
				//
				// Note that we do more than just ar here, and extrapolate
				// to the end of any previous region that we may have mapped.
				// This is done to ensure that lower layers can fully invalidate
				// intermediate pagetable pages during the unmap.
				var unmapAR hostarch.AddrRange
				if prev := pseg.PrevSegment(); prev.Ok() {
					unmapAR.Start = prev.End()
				} else {
					unmapAR.Start = mm.layout.MinAddr
				}
				if last := mm.pmas.LowerBoundSegment(ar.End); last.Ok() {
					if last.Start() < ar.End {
						unmapAR.End = ar.End
					} else {
						unmapAR.End = last.Start()
					}
				} else {
					unmapAR.End = mm.layout.MaxAddr
				}
				mm.unmapASLocked(unmapAR)
				didUnmapAS = true
			}
			mm.removeRSSLocked(pseg.Range())
			pma.file.DecRef(pseg.fileRange())
			pseg = mm.pmas.Remove(pseg).NextSegment()
		} else {
			pseg = pseg.NextSegment()
		}
	}
}
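
// Editorial sketch, not part of the original file: Invalidate is the
// memmap.MappingSpace entry point that a memmap.Mappable calls when a range of
// its offsets changes backing. Setting InvalidatePrivate also discards private
// (COW-broken) copies; the wrapper name is hypothetical.
func invalidateAllSketch(mm *MemoryManager, ar hostarch.AddrRange) {
	mm.Invalidate(ar, memmap.InvalidateOpts{InvalidatePrivate: true})
}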

// Pin returns the memmap.File ranges currently mapped by addresses in ar in
// mm, acquiring a reference on the returned ranges which the caller must
// release by calling Unpin. If not all addresses are mapped, Pin returns a
// non-nil error. Note that Pin may return both a non-empty slice of
// PinnedRanges and a non-nil error.
//
// Pin does not prevent mapped ranges from changing, making it unsuitable for
// most I/O. It should only be used in contexts that would use get_user_pages()
// in the Linux kernel.
//
// Preconditions:
//   - ar.Length() != 0.
//   - ar must be page-aligned.
func (mm *MemoryManager) Pin(ctx context.Context, ar hostarch.AddrRange, at hostarch.AccessType, ignorePermissions bool) ([]PinnedRange, error) {
	if checkInvariants {
		if !ar.WellFormed() || ar.Length() == 0 || !ar.IsPageAligned() {
			panic(fmt.Sprintf("invalid ar: %v", ar))
		}
	}

	// Ensure that we have usable vmas.
	mm.mappingMu.RLock()
	vseg, vend, verr := mm.getVMAsLocked(ctx, ar, at, ignorePermissions)
	if vendaddr := vend.Start(); vendaddr < ar.End {
		if vendaddr <= ar.Start {
			mm.mappingMu.RUnlock()
			return nil, verr
		}
		ar.End = vendaddr
	}

	// Ensure that we have usable pmas.
	mm.activeMu.Lock()
	pseg, pend, perr := mm.getPMAsLocked(ctx, vseg, ar, at)
	mm.mappingMu.RUnlock()
	if pendaddr := pend.Start(); pendaddr < ar.End {
		if pendaddr <= ar.Start {
			mm.activeMu.Unlock()
			return nil, perr
		}
		ar.End = pendaddr
	}

	memCgID := pgalloc.MemoryCgroupIDFromContext(ctx)
	// Gather pmas.
	var prs []PinnedRange
	for pseg.Ok() && pseg.Start() < ar.End {
		psar := pseg.Range().Intersect(ar)
		f := pseg.ValuePtr().file
		fr := pseg.fileRangeOf(psar)
		f.IncRef(fr, memCgID)
		prs = append(prs, PinnedRange{
			Source: psar,
			File:   f,
			Offset: fr.Start,
		})
		pseg = pseg.NextSegment()
	}
	mm.activeMu.Unlock()

	// Return the first error in order of progress through ar.
	if perr != nil {
		return prs, perr
	}
	return prs, verr
}

// PinnedRanges are returned by MemoryManager.Pin.
type PinnedRange struct {
	// Source is the corresponding range of addresses.
	Source hostarch.AddrRange

	// File is the mapped file.
	File memmap.File

	// Offset is the offset into File at which this PinnedRange begins.
	Offset uint64
}

// FileRange returns the memmap.File offsets mapped by pr.
func (pr PinnedRange) FileRange() memmap.FileRange {
	return memmap.FileRange{pr.Offset, pr.Offset + uint64(pr.Source.Length())}
}

// Unpin releases the reference held by prs.
func Unpin(prs []PinnedRange) {
	for i := range prs {
		prs[i].File.DecRef(prs[i].FileRange())
	}
}
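
// Editorial sketch, not part of the original file: the intended Pin/Unpin
// pairing. Pin can return pinned ranges together with a non-nil error, so the
// deferred Unpin must run regardless; the function name and read-only access
// type are assumptions for illustration.
func pinnedReadSketch(ctx context.Context, mm *MemoryManager, ar hostarch.AddrRange) error {
	prs, err := mm.Pin(ctx, ar, hostarch.Read, false /* ignorePermissions */)
	defer Unpin(prs) // release references on whatever was pinned
	if err != nil {
		return err
	}
	for _, pr := range prs {
		// pr.File and pr.FileRange() identify the pinned backing pages for
		// pr.Source; real callers would map or otherwise access them here.
		_ = pr.FileRange()
	}
	return nil
}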

// movePMAsLocked moves all pmas in oldAR to newAR.
//
// Preconditions:
//   - mm.activeMu must be locked for writing.
//   - oldAR.Length() != 0.
//   - oldAR.Length() <= newAR.Length().
//   - !oldAR.Overlaps(newAR).
//   - mm.pmas.IsEmptyRange(newAR).
//   - oldAR and newAR must be page-aligned.
func (mm *MemoryManager) movePMAsLocked(oldAR, newAR hostarch.AddrRange) {
	if checkInvariants {
		if !oldAR.WellFormed() || oldAR.Length() == 0 || !oldAR.IsPageAligned() {
			panic(fmt.Sprintf("invalid oldAR: %v", oldAR))
		}
		if !newAR.WellFormed() || newAR.Length() == 0 || !newAR.IsPageAligned() {
			panic(fmt.Sprintf("invalid newAR: %v", newAR))
		}
		if oldAR.Length() > newAR.Length() {
			panic(fmt.Sprintf("old address range %v may contain pmas that will not fit in new address range %v", oldAR, newAR))
		}
		if oldAR.Overlaps(newAR) {
			panic(fmt.Sprintf("old and new address ranges overlap: %v, %v", oldAR, newAR))
		}
		// mm.pmas.IsEmptyRange is checked by mm.pmas.Insert.
	}

	type movedPMA struct {
		oldAR hostarch.AddrRange
		pma   pma
	}
	var movedPMAs []movedPMA
	pseg := mm.pmas.LowerBoundSegment(oldAR.Start)
	for pseg.Ok() && pseg.Start() < oldAR.End {
		pseg = mm.pmas.Isolate(pseg, oldAR)
		movedPMAs = append(movedPMAs, movedPMA{
			oldAR: pseg.Range(),
			pma:   pseg.Value(),
		})
		pseg = mm.pmas.Remove(pseg).NextSegment()
		// No RSS change is needed since we're re-inserting the same pmas
		// below.
	}

	off := newAR.Start - oldAR.Start
	pgap := mm.pmas.FindGap(newAR.Start)
	for i := range movedPMAs {
		mpma := &movedPMAs[i]
		pmaNewAR := hostarch.AddrRange{mpma.oldAR.Start + off, mpma.oldAR.End + off}
		pgap = mm.pmas.Insert(pgap, pmaNewAR, mpma.pma).NextGap()
	}

	mm.unmapASLocked(oldAR)
}

// internalMappingsLocked returns cached internal mappings for addresses in ar.
//
// Preconditions:
//   - mm.activeMu must be locked.
//   - While mm.activeMu was locked, a call to
//     existingPMAsLocked(needInternalMappings=true) succeeded for all
//     addresses in ar.
//   - ar.Length() != 0.
//   - pseg.Range().Contains(ar.Start).
func (mm *MemoryManager) internalMappingsLocked(pseg pmaIterator, ar hostarch.AddrRange) safemem.BlockSeq {
	if checkInvariants {
		if !ar.WellFormed() || ar.Length() == 0 {
			panic(fmt.Sprintf("invalid ar: %v", ar))
		}
		if !pseg.Range().Contains(ar.Start) {
			panic(fmt.Sprintf("initial pma %v does not cover start of ar %v", pseg.Range(), ar))
		}
	}

	if ar.End <= pseg.End() {
		// Since only one pma is involved, we can use pma.internalMappings
		// directly, avoiding a slice allocation.
		offset := uint64(ar.Start - pseg.Start())
		return pseg.ValuePtr().internalMappings.DropFirst64(offset).TakeFirst64(uint64(ar.Length()))
	}

	var ims []safemem.Block
	for {
		pr := pseg.Range().Intersect(ar)
		for pims := pseg.ValuePtr().internalMappings.DropFirst64(uint64(pr.Start - pseg.Start())).TakeFirst64(uint64(pr.Length())); !pims.IsEmpty(); pims = pims.Tail() {
			ims = append(ims, pims.Head())
		}
		if ar.End <= pseg.End() {
			break
		}
		pseg = pseg.NextSegment()
	}
	return safemem.BlockSeqFromSlice(ims)
}
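
// Editorial sketch, not part of the original file: once
// existingPMAsLocked(needInternalMappings=true) has succeeded for ar, the
// cached internal mappings can be consumed through safemem, much as the
// COW-break path above does with a BlockSeqReader. safemem.CopySeq here stands
// in for any consumer; the helper name is hypothetical.
func copyFromInternalMappingsSketch(mm *MemoryManager, pseg pmaIterator, ar hostarch.AddrRange, dst safemem.BlockSeq) (uint64, error) {
	ims := mm.internalMappingsLocked(pseg, ar)
	return safemem.CopySeq(dst, ims)
}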

// vecInternalMappingsLocked returns cached internal mappings for addresses in
// ars.
//
// Preconditions:
//   - mm.activeMu must be locked.
//   - While mm.activeMu was locked, a call to
//     existingVecPMAsLocked(needInternalMappings=true) succeeded for all
//     addresses in ars.
func (mm *MemoryManager) vecInternalMappingsLocked(ars hostarch.AddrRangeSeq) safemem.BlockSeq {
	var ims []safemem.Block
	for ; !ars.IsEmpty(); ars = ars.Tail() {
		ar := ars.Head()
		if ar.Length() == 0 {
			continue
		}
		for pims := mm.internalMappingsLocked(mm.pmas.FindSegment(ar.Start), ar); !pims.IsEmpty(); pims = pims.Tail() {
			ims = append(ims, pims.Head())
		}
	}
	return safemem.BlockSeqFromSlice(ims)
}

// addRSSLocked updates the current and maximum resident set size of a
// MemoryManager to reflect the insertion of a pma at ar.
//
// Preconditions: mm.activeMu must be locked for writing.
func (mm *MemoryManager) addRSSLocked(ar hostarch.AddrRange) {
	mm.curRSS += uint64(ar.Length())
	if mm.curRSS > mm.maxRSS {
		mm.maxRSS = mm.curRSS
	}
}

// removeRSSLocked updates the current resident set size of a MemoryManager to
// reflect the removal of a pma at ar.
//
// Preconditions: mm.activeMu must be locked for writing.
func (mm *MemoryManager) removeRSSLocked(ar hostarch.AddrRange) {
	mm.curRSS -= uint64(ar.Length())
}

// pmaSetFunctions implements segment.Functions for pmaSet.
type pmaSetFunctions struct{}

func (pmaSetFunctions) MinKey() hostarch.Addr {
	return 0
}

func (pmaSetFunctions) MaxKey() hostarch.Addr {
	return ^hostarch.Addr(0)
}

func (pmaSetFunctions) ClearValue(pma *pma) {
	pma.file = nil
	pma.internalMappings = safemem.BlockSeq{}
}

func (pmaSetFunctions) Merge(ar1 hostarch.AddrRange, pma1 pma, ar2 hostarch.AddrRange, pma2 pma) (pma, bool) {
	if pma1.file != pma2.file ||
		pma1.off+uint64(ar1.Length()) != pma2.off ||
		pma1.translatePerms != pma2.translatePerms ||
		pma1.effectivePerms != pma2.effectivePerms ||
		pma1.maxPerms != pma2.maxPerms ||
		pma1.needCOW != pma2.needCOW ||
		pma1.private != pma2.private {
		return pma{}, false
	}

	// Discard internal mappings instead of trying to merge them, since merging
	// them requires an allocation and getting them again from the
	// memmap.File might not.
	pma1.internalMappings = safemem.BlockSeq{}
	return pma1, true
}

func (pmaSetFunctions) Split(ar hostarch.AddrRange, p pma, split hostarch.Addr) (pma, pma) {
	newlen1 := uint64(split - ar.Start)
	p2 := p
	p2.off += newlen1
	if !p.internalMappings.IsEmpty() {
		p.internalMappings = p.internalMappings.TakeFirst64(newlen1)
		p2.internalMappings = p2.internalMappings.DropFirst64(newlen1)
	}
	return p, p2
}
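
// Editorial sketch, not part of the original file: two adjacent pmas merge
// only when they map contiguous offsets of the same file and agree on all
// permission and COW state; the merged pma's cached internal mappings are
// discarded rather than concatenated. The nil file below is used purely to
// illustrate the offset-contiguity requirement.
func pmaMergeExampleSketch() bool {
	ar1 := hostarch.AddrRange{0x1000, 0x3000}
	ar2 := hostarch.AddrRange{0x3000, 0x4000}
	p1 := pma{off: 0x10000, translatePerms: hostarch.AnyAccess, effectivePerms: hostarch.ReadWrite, maxPerms: hostarch.AnyAccess, private: true}
	p2 := pma{off: 0x12000, translatePerms: hostarch.AnyAccess, effectivePerms: hostarch.ReadWrite, maxPerms: hostarch.AnyAccess, private: true}
	// p1.off + ar1.Length() == p2.off, so the two pmas are mergeable.
	_, ok := pmaSetFunctions{}.Merge(ar1, p1, ar2, p2)
	return ok
}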

// findOrSeekPrevUpperBoundPMA returns mm.pmas.UpperBoundSegment(addr), but may
// do so by scanning linearly backward from pgap.
//
// Preconditions:
//   - mm.activeMu must be locked.
//   - addr <= pgap.Start().
func (mm *MemoryManager) findOrSeekPrevUpperBoundPMA(addr hostarch.Addr, pgap pmaGapIterator) pmaIterator {
	if checkInvariants {
		if !pgap.Ok() {
			panic("terminal pma iterator")
		}
		if addr > pgap.Start() {
			panic(fmt.Sprintf("can't seek backward to %#x from %#x", addr, pgap.Start()))
		}
	}
	// Optimistically check if pgap.PrevSegment() is the PMA we're looking for,
	// which is the case if findOrSeekPrevUpperBoundPMA is called to find the
	// start of a range containing only a single PMA.
	if pseg := pgap.PrevSegment(); pseg.Start() <= addr {
		return pseg
	}
	return mm.pmas.UpperBoundSegment(addr)
}

// getInternalMappingsLocked ensures that pseg.ValuePtr().internalMappings is
// non-empty.
//
// Preconditions: mm.activeMu must be locked for writing.
func (pseg pmaIterator) getInternalMappingsLocked() error {
	pma := pseg.ValuePtr()
	if pma.internalMappings.IsEmpty() {
		// This must use maxPerms (instead of perms) because some permission
		// constraints are only visible to vmas; for example, mappings of
		// read-only files have vma.maxPerms.Write unset, but this may not be
		// visible to the memmap.Mappable.
		perms := pma.maxPerms
		// We will never execute application code through an internal mapping.
		perms.Execute = false
		ims, err := pma.file.MapInternal(pseg.fileRange(), perms)
		if err != nil {
			return err
		}
		pma.internalMappings = ims
	}
	return nil
}

func (pseg pmaIterator) fileRange() memmap.FileRange {
	return pseg.fileRangeOf(pseg.Range())
}

// Preconditions:
//   - pseg.Range().IsSupersetOf(ar).
//   - ar.Length() != 0.
func (pseg pmaIterator) fileRangeOf(ar hostarch.AddrRange) memmap.FileRange {
	if checkInvariants {
		if !pseg.Ok() {
			panic("terminal pma iterator")
		}
		if !ar.WellFormed() || ar.Length() == 0 {
			panic(fmt.Sprintf("invalid ar: %v", ar))
		}
		if !pseg.Range().IsSupersetOf(ar) {
			panic(fmt.Sprintf("ar %v out of bounds %v", ar, pseg.Range()))
		}
	}

	pma := pseg.ValuePtr()
	pstart := pseg.Start()
	return memmap.FileRange{pma.off + uint64(ar.Start-pstart), pma.off + uint64(ar.End-pstart)}
}

// joinAddrRanges returns the smallest hostarch.AddrRange that is a superset of
// both ar1 and ar2. If either ar1 or ar2 have length 0, joinAddrRanges returns
// the other range. If both ar1 and ar2 have length 0, joinAddrRanges returns
// an unspecified range with length 0.
func joinAddrRanges(ar1, ar2 hostarch.AddrRange) hostarch.AddrRange {
	if ar1.Length() == 0 {
		return ar2
	}
	if ar2.Length() == 0 {
		return ar1
	}
	ar := ar1
	if ar.Start > ar2.Start {
		ar.Start = ar2.Start
	}
	if ar.End < ar2.End {
		ar.End = ar2.End
	}
	if checkInvariants {
		if !ar.IsSupersetOf(ar1) || !ar.IsSupersetOf(ar2) {
			panic(fmt.Sprintf("%v is not a superset of both %v and %v", ar, ar1, ar2))
		}
	}
	return ar
}

// pendingFileDecRefs accumulates released memmap.FileRange references so that
// calls to memmap.File.DecRef() can occur without holding locks.
type pendingFileDecRefs struct {
	slice []pendingFileDecRef
}

type pendingFileDecRef struct {
	file memmap.File
	fr   memmap.FileRange
}

var pendingFileDecRefsPool = sync.Pool{
	New: func() any {
		return &pendingFileDecRefs{}
	},
}

func appendPendingFileDecRef(pfdrs *pendingFileDecRefs, file memmap.File, fr memmap.FileRange) *pendingFileDecRefs {
	if pfdrs == nil {
		pfdrs = pendingFileDecRefsPool.Get().(*pendingFileDecRefs)
	}
	pfdrs.slice = append(pfdrs.slice, pendingFileDecRef{file, fr})
	return pfdrs
}
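
// Editorial sketch, not part of the original file: the pendingFileDecRefs
// pattern used by getPMAsInternalLocked. DecRefs are accumulated while
// mm.activeMu is held and only dropped by Cleanup after locks are released
// and any affected AddressSpace ranges have been unmapped.
func pendingFileDecRefsSketch(file memmap.File, fr memmap.FileRange) {
	var pfdrs *pendingFileDecRefs
	// In real callers this deferred Cleanup runs after all locks are dropped.
	defer func() { pfdrs.Cleanup() }()
	pfdrs = appendPendingFileDecRef(pfdrs, file, fr)
}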

// Cleanup releases all references accumulated by pfdrs and releases ownership
// of pfdrs. pfdrs may be nil.
//
// Preconditions: No AddressSpace ranges may be awaiting unmapping (since such
// ranges may refer to memmap.File pages that will be dropped).
func (pfdrs *pendingFileDecRefs) Cleanup() {
	if pfdrs == nil {
		return
	}
	for i := range pfdrs.slice {
		pfdr := &pfdrs.slice[i]
		pfdr.file.DecRef(pfdr.fr)
		pfdr.file = nil // allow GC
	}
	pfdrs.slice = pfdrs.slice[:0]
	pendingFileDecRefsPool.Put(pfdrs)
}