github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/sentry/mm/pma.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package mm

import (
	"fmt"
	"sync/atomic"

	"github.com/metacubex/gvisor/pkg/context"
	"github.com/metacubex/gvisor/pkg/errors/linuxerr"
	"github.com/metacubex/gvisor/pkg/hostarch"
	"github.com/metacubex/gvisor/pkg/safecopy"
	"github.com/metacubex/gvisor/pkg/safemem"
	"github.com/metacubex/gvisor/pkg/sentry/memmap"
	"github.com/metacubex/gvisor/pkg/sentry/pgalloc"
	"github.com/metacubex/gvisor/pkg/sentry/usage"
)

// existingPMAsLocked checks that pmas exist for all addresses in ar, and
// support access of type (at, ignorePermissions). If so, it returns an
// iterator to the pma containing ar.Start. Otherwise it returns a terminal
// iterator.
//
// Preconditions:
//   - mm.activeMu must be locked.
//   - ar.Length() != 0.
func (mm *MemoryManager) existingPMAsLocked(ar hostarch.AddrRange, at hostarch.AccessType, ignorePermissions bool, needInternalMappings bool) pmaIterator {
	if checkInvariants {
		if !ar.WellFormed() || ar.Length() == 0 {
			panic(fmt.Sprintf("invalid ar: %v", ar))
		}
	}

	first := mm.pmas.FindSegment(ar.Start)
	pseg := first
	for pseg.Ok() {
		pma := pseg.ValuePtr()
		perms := pma.effectivePerms
		if ignorePermissions {
			perms = pma.maxPerms
		}
		if !perms.SupersetOf(at) {
			return pmaIterator{}
		}
		if needInternalMappings && pma.internalMappings.IsEmpty() {
			return pmaIterator{}
		}

		if ar.End <= pseg.End() {
			return first
		}
		pseg, _ = pseg.NextNonEmpty()
	}

	// Ran out of pmas before reaching ar.End.
	return pmaIterator{}
}

// existingVecPMAsLocked returns true if pmas exist for all addresses in ars,
// and support access of type (at, ignorePermissions).
//
// Preconditions: mm.activeMu must be locked.
func (mm *MemoryManager) existingVecPMAsLocked(ars hostarch.AddrRangeSeq, at hostarch.AccessType, ignorePermissions bool, needInternalMappings bool) bool {
	for ; !ars.IsEmpty(); ars = ars.Tail() {
		if ar := ars.Head(); ar.Length() != 0 && !mm.existingPMAsLocked(ar, at, ignorePermissions, needInternalMappings).Ok() {
			return false
		}
	}
	return true
}
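
// For illustration: a pma with effectivePerms = r-- and maxPerms = rw- can
// result from a writable shared mapping that was later mprotect()ed
// read-only. A Write access then satisfies existingPMAsLocked only when
// ignorePermissions is true (as used by, for example, ptrace-style access);
// a Read access satisfies it either way. Callers that intend to use cached
// internal mappings also pass needInternalMappings = true, so pmas whose
// internal mappings have not yet been established are treated as missing and
// the caller falls back to the slower path that establishes them.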

// getPMAsLocked ensures that pmas exist for all addresses in ar, and support
// access of type at. It returns:
//
//   - An iterator to the pma containing ar.Start. If no pma contains ar.Start,
//     the iterator is unspecified.
//
//   - An iterator to the gap after the last pma containing an address in ar. If
//     pmas exist for no addresses in ar, the iterator is to a gap that begins
//     before ar.Start.
//
//   - An error that is non-nil if pmas exist for only a subset of ar.
//
// Preconditions:
//   - mm.mappingMu must be locked.
//   - mm.activeMu must be locked for writing.
//   - ar.Length() != 0.
//   - vseg.Range().Contains(ar.Start).
//   - vmas must exist for all addresses in ar, and support accesses of type at
//     (i.e. permission checks must have been performed against vmas).
func (mm *MemoryManager) getPMAsLocked(ctx context.Context, vseg vmaIterator, ar hostarch.AddrRange, at hostarch.AccessType) (pmaIterator, pmaGapIterator, error) {
	if checkInvariants {
		if !ar.WellFormed() || ar.Length() == 0 {
			panic(fmt.Sprintf("invalid ar: %v", ar))
		}
		if !vseg.Ok() {
			panic("terminal vma iterator")
		}
		if !vseg.Range().Contains(ar.Start) {
			panic(fmt.Sprintf("initial vma %v does not cover start of ar %v", vseg.Range(), ar))
		}
	}

	// Page-align ar so that all AddrRanges are aligned.
	end, ok := ar.End.RoundUp()
	var alignerr error
	if !ok {
		end = ar.End.RoundDown()
		alignerr = linuxerr.EFAULT
	}
	ar = hostarch.AddrRange{ar.Start.RoundDown(), end}

	pstart, pend, perr := mm.getPMAsInternalLocked(ctx, vseg, ar, at)
	if pend.Start() <= ar.Start {
		return pmaIterator{}, pend, perr
	}
	// getPMAsInternalLocked may not have returned pstart due to iterator
	// invalidation.
	if !pstart.Ok() {
		pstart = mm.findOrSeekPrevUpperBoundPMA(ar.Start, pend)
	}
	if perr != nil {
		return pstart, pend, perr
	}
	return pstart, pend, alignerr
}

// getVecPMAsLocked ensures that pmas exist for all addresses in ars, and
// support access of type at. It returns the subset of ars for which pmas
// exist. If this is not equal to ars, it returns a non-nil error explaining
// why.
//
// Preconditions:
//   - mm.mappingMu must be locked.
//   - mm.activeMu must be locked for writing.
//   - vmas must exist for all addresses in ars, and support accesses of type at
//     (i.e. permission checks must have been performed against vmas).
func (mm *MemoryManager) getVecPMAsLocked(ctx context.Context, ars hostarch.AddrRangeSeq, at hostarch.AccessType) (hostarch.AddrRangeSeq, error) {
	for arsit := ars; !arsit.IsEmpty(); arsit = arsit.Tail() {
		ar := arsit.Head()
		if ar.Length() == 0 {
			continue
		}
		if checkInvariants {
			if !ar.WellFormed() {
				panic(fmt.Sprintf("invalid ar: %v", ar))
			}
		}

		// Page-align ar so that all AddrRanges are aligned.
		end, ok := ar.End.RoundUp()
		var alignerr error
		if !ok {
			end = ar.End.RoundDown()
			alignerr = linuxerr.EFAULT
		}
		ar = hostarch.AddrRange{ar.Start.RoundDown(), end}

		_, pend, perr := mm.getPMAsInternalLocked(ctx, mm.vmas.FindSegment(ar.Start), ar, at)
		if perr != nil {
			return truncatedAddrRangeSeq(ars, arsit, pend.Start()), perr
		}
		if alignerr != nil {
			return truncatedAddrRangeSeq(ars, arsit, pend.Start()), alignerr
		}
	}

	return ars, nil
}
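
// Worked example (illustrative, assuming 4 KiB pages): a caller passing
// ar = [0x7001, 0x9001) has the range page-aligned to [0x7000, 0xA000) before
// pmas are materialized. If ar.End is instead so close to the top of the
// address space that rounding it up would overflow, the range is truncated to
// the last whole page and the truncation is reported as linuxerr.EFAULT, but
// only after pmas have been established for the truncated prefix.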

// getPMAsInternalLocked is equivalent to getPMAsLocked, with the following
// exceptions:
//
//   - getPMAsInternalLocked returns a pmaIterator on a best-effort basis (that
//     is, the returned iterator may be terminal, even if a pma that contains
//     ar.Start exists). Returning this iterator on a best-effort basis allows
//     callers that require it to use it when it's cheaply available, while also
//     avoiding the overhead of retrieving it when it's not.
//
//   - getPMAsInternalLocked additionally requires that ar is page-aligned.
//     getPMAsInternalLocked is an implementation helper for getPMAsLocked and
//     getVecPMAsLocked; other clients should call one of those instead.
func (mm *MemoryManager) getPMAsInternalLocked(ctx context.Context, vseg vmaIterator, ar hostarch.AddrRange, at hostarch.AccessType) (pmaIterator, pmaGapIterator, error) {
	if checkInvariants {
		if !ar.WellFormed() || ar.Length() == 0 || !ar.IsPageAligned() {
			panic(fmt.Sprintf("invalid ar: %v", ar))
		}
		if !vseg.Ok() {
			panic("terminal vma iterator")
		}
		if !vseg.Range().Contains(ar.Start) {
			panic(fmt.Sprintf("initial vma %v does not cover start of ar %v", vseg.Range(), ar))
		}
	}

	memCgID := pgalloc.MemoryCgroupIDFromContext(ctx)
	opts := pgalloc.AllocOpts{Kind: usage.Anonymous, Dir: pgalloc.BottomUp, MemCgID: memCgID}
	vma := vseg.ValuePtr()
	if uintptr(ar.Start) < atomic.LoadUintptr(&vma.lastFault) {
		// Detect cases where memory is accessed downwards and change memory file
		// allocation order to increase the chances that pages are coalesced.
		opts.Dir = pgalloc.TopDown
	}
	atomic.StoreUintptr(&vma.lastFault, uintptr(ar.Start))
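
	// Illustrative consequence of the heuristic above: a stack that is
	// written by successive pushes faults at successively lower addresses,
	// so each new fault sees ar.Start below vma.lastFault and allocates
	// TopDown. The new allocation then tends to sit immediately before the
	// previous one in the memory file, letting the resulting pmas merge
	// (pmaSetFunctions.Merge below requires file-contiguous offsets).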

	// Limit the range we allocate to ar, aligned to privateAllocUnit.
	maskAR := privateAligned(ar)
	didUnmapAS := false
	// The range in which we iterate vmas and pmas is still limited to ar, to
	// ensure that we don't allocate or COW-break a pma we don't need.
	pseg, pgap := mm.pmas.Find(ar.Start)
	pstart := pseg
	for {
		// Get pmas for this vma.
		vsegAR := vseg.Range().Intersect(ar)
		vma := vseg.ValuePtr()
	pmaLoop:
		for {
			switch {
			case pgap.Ok() && pgap.Start() < vsegAR.End:
				// Need a pma here.
				optAR := vseg.Range().Intersect(pgap.Range())
				if checkInvariants {
					if optAR.Length() == 0 {
						panic(fmt.Sprintf("vseg %v and pgap %v do not overlap", vseg, pgap))
					}
				}
				if vma.mappable == nil {
					// Private anonymous mappings get pmas by allocating.
					allocAR := optAR.Intersect(maskAR)
					fr, err := mm.mf.Allocate(uint64(allocAR.Length()), opts)
					if err != nil {
						return pstart, pgap, err
					}
					if checkInvariants {
						if !fr.WellFormed() || fr.Length() != uint64(allocAR.Length()) {
							panic(fmt.Sprintf("Allocate(%v) returned invalid FileRange %v", allocAR.Length(), fr))
						}
					}
					mm.addRSSLocked(allocAR)
					pseg, pgap = mm.pmas.Insert(pgap, allocAR, pma{
						file:           mm.mf,
						off:            fr.Start,
						translatePerms: hostarch.AnyAccess,
						effectivePerms: vma.effectivePerms,
						maxPerms:       vma.maxPerms,
						// Since we just allocated this memory and have the
						// only reference, the new pma does not need
						// copy-on-write.
						private: true,
					}).NextNonEmpty()
					pstart = pmaIterator{} // iterators invalidated
				} else {
					// Other mappings get pmas by translating.
					optMR := vseg.mappableRangeOf(optAR)
					reqAR := optAR.Intersect(ar)
					reqMR := vseg.mappableRangeOf(reqAR)
					perms := at
					if vma.private {
						// This pma will be copy-on-write; don't require write
						// permission, but do require read permission to
						// facilitate the copy.
						//
						// If at.Write is true, we will need to break
						// copy-on-write immediately, which occurs after
						// translation below.
						perms.Read = true
						perms.Write = false
					}
					ts, err := vma.mappable.Translate(ctx, reqMR, optMR, perms)
					if checkInvariants {
						if err := memmap.CheckTranslateResult(reqMR, optMR, perms, ts, err); err != nil {
							panic(fmt.Sprintf("Mappable(%T).Translate(%v, %v, %v): %v", vma.mappable, reqMR, optMR, perms, err))
						}
					}
					// Install a pma for each translation.
					if len(ts) == 0 {
						return pstart, pgap, err
					}
					pstart = pmaIterator{} // iterators invalidated
					for _, t := range ts {
						newpmaAR := vseg.addrRangeOf(t.Source)
						newpma := pma{
							file:           t.File,
							off:            t.Offset,
							translatePerms: t.Perms,
							effectivePerms: vma.effectivePerms.Intersect(t.Perms),
							maxPerms:       vma.maxPerms.Intersect(t.Perms),
						}
						if vma.private {
							newpma.effectivePerms.Write = false
							newpma.maxPerms.Write = false
							newpma.needCOW = true
						}
						mm.addRSSLocked(newpmaAR)
						t.File.IncRef(t.FileRange(), memCgID)
						// This is valid because memmap.Mappable.Translate is
						// required to return Translations in increasing
						// Translation.Source order.
						pseg = mm.pmas.Insert(pgap, newpmaAR, newpma)
						pgap = pseg.NextGap()
					}
					// The error returned by Translate is only significant if
					// it occurred before ar.End.
					if err != nil && vseg.addrRangeOf(ts[len(ts)-1].Source).End < ar.End {
						return pstart, pgap, err
					}
					// Rewind pseg to the first pma inserted and continue the
					// loop to check if we need to break copy-on-write.
					pseg, pgap = mm.findOrSeekPrevUpperBoundPMA(vseg.addrRangeOf(ts[0].Source).Start, pgap), pmaGapIterator{}
					continue
				}

			case pseg.Ok() && pseg.Start() < vsegAR.End:
				oldpma := pseg.ValuePtr()
				if at.Write && mm.isPMACopyOnWriteLocked(vseg, pseg) {
					// Break copy-on-write by copying.
					if checkInvariants {
						if !oldpma.maxPerms.Read {
							panic(fmt.Sprintf("pma %v needs to be copied for writing, but is not readable: %v", pseg.Range(), oldpma))
						}
					}
					var copyAR hostarch.AddrRange
					if vma := vseg.ValuePtr(); vma.effectivePerms.Execute {
						// The majority of copy-on-write breaks on executable
						// pages come from:
						//
						// - The ELF loader, which must zero out bytes on the
						//   last page of each segment after the end of the
						//   segment.
						//
						// - gdb's use of ptrace to insert breakpoints.
						//
						// Neither of these cases has enough spatial locality
						// to benefit from copying nearby pages, so if the vma
						// is executable, only copy the pages required.
						copyAR = pseg.Range().Intersect(ar)
					} else if vma.growsDown {
						// In most cases, the new process will not use most of
						// its stack before exiting or invoking execve(); it is
						// especially unlikely to return very far down its call
						// stack, since async-signal-safety concerns in
						// multithreaded programs prevent the new process from
						// being able to do much. So only copy up to one page
						// before and after the pages required.
						stackMaskAR := ar
						if newStart := stackMaskAR.Start - hostarch.PageSize; newStart < stackMaskAR.Start {
							stackMaskAR.Start = newStart
						}
						if newEnd := stackMaskAR.End + hostarch.PageSize; newEnd > stackMaskAR.End {
							stackMaskAR.End = newEnd
						}
						copyAR = pseg.Range().Intersect(stackMaskAR)
					} else {
						copyAR = pseg.Range().Intersect(maskAR)
					}
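
					// The newStart/newEnd comparisons in the growsDown case
					// above are wraparound guards: if ar already abuts the
					// bottom or top of the address space, the subtraction or
					// addition of hostarch.PageSize wraps, the comparison
					// fails, and stackMaskAR is left unwidened in that
					// direction.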

					// Get internal mappings from the pma to copy from.
					if err := pseg.getInternalMappingsLocked(); err != nil {
						return pstart, pseg.PrevGap(), err
					}
					// Copy contents.
					reader := safemem.BlockSeqReader{Blocks: mm.internalMappingsLocked(pseg, copyAR)}
					fr, err := mm.mf.Allocate(uint64(copyAR.Length()), pgalloc.AllocOpts{
						Kind:       usage.Anonymous,
						Mode:       pgalloc.AllocateAndWritePopulate,
						MemCgID:    memCgID,
						ReaderFunc: reader.ReadToBlocks,
					})
					if _, ok := err.(safecopy.BusError); ok {
						// If we got SIGBUS during the copy, deliver SIGBUS to
						// userspace (instead of SIGSEGV) if we're breaking
						// copy-on-write due to application page fault.
						err = &memmap.BusError{err}
					}
					if fr.Length() == 0 {
						return pstart, pseg.PrevGap(), err
					}
					// Unmap all of maskAR, not just copyAR, to minimize host
					// syscalls. AddressSpace mappings must be removed before
					// oldpma.file.DecRef().
					if !didUnmapAS {
						mm.unmapASLocked(maskAR)
						didUnmapAS = true
					}
					// Replace the pma with a copy in the part of the address
					// range where copying was successful. This doesn't change
					// RSS.
					copyAR.End = copyAR.Start + hostarch.Addr(fr.Length())
					if copyAR != pseg.Range() {
						pseg = mm.pmas.Isolate(pseg, copyAR)
						pstart = pmaIterator{} // iterators invalidated
					}
					oldpma = pseg.ValuePtr()
					oldpma.file.DecRef(pseg.fileRange())
					oldpma.file = mm.mf
					oldpma.off = fr.Start
					oldpma.translatePerms = hostarch.AnyAccess
					oldpma.effectivePerms = vma.effectivePerms
					oldpma.maxPerms = vma.maxPerms
					oldpma.needCOW = false
					oldpma.private = true
					oldpma.internalMappings = safemem.BlockSeq{}
					// Try to merge the pma with its neighbors.
					if prev := pseg.PrevSegment(); prev.Ok() {
						if merged := mm.pmas.Merge(prev, pseg); merged.Ok() {
							pseg = merged
							pstart = pmaIterator{} // iterators invalidated
						}
					}
					if next := pseg.NextSegment(); next.Ok() {
						if merged := mm.pmas.Merge(pseg, next); merged.Ok() {
							pseg = merged
							pstart = pmaIterator{} // iterators invalidated
						}
					}
					// The error returned by Allocate is only significant if
					// it occurred before ar.End.
					if err != nil && pseg.End() < ar.End {
						return pstart, pseg.NextGap(), err
					}
					// Ensure pseg and pgap are correct for the next iteration
					// of the loop.
					pseg, pgap = pseg.NextNonEmpty()
				} else if !oldpma.translatePerms.SupersetOf(at) {
					// Get new pmas (with sufficient permissions) by calling
					// memmap.Mappable.Translate again.
					if checkInvariants {
						if oldpma.private {
							panic(fmt.Sprintf("private pma %v has non-maximal pma.translatePerms: %v", pseg.Range(), oldpma))
						}
					}
					// Allow the entire pma to be replaced.
					optAR := pseg.Range()
					optMR := vseg.mappableRangeOf(optAR)
					reqAR := optAR.Intersect(ar)
					reqMR := vseg.mappableRangeOf(reqAR)
					perms := oldpma.translatePerms.Union(at)
					ts, err := vma.mappable.Translate(ctx, reqMR, optMR, perms)
					if checkInvariants {
						if err := memmap.CheckTranslateResult(reqMR, optMR, perms, ts, err); err != nil {
							panic(fmt.Sprintf("Mappable(%T).Translate(%v, %v, %v): %v", vma.mappable, reqMR, optMR, perms, err))
						}
					}
					// Remove the part of the existing pma covered by new
					// Translations, then insert new pmas. This doesn't change
					// RSS. Note that we don't need to call unmapASLocked: any
					// existing AddressSpace mappings are still valid (though
					// less permissive than the new pmas indicate) until
					// Invalidate is called, and will be replaced by future
					// calls to mapASLocked.
					if len(ts) == 0 {
						return pstart, pseg.PrevGap(), err
					}
					transMR := memmap.MappableRange{ts[0].Source.Start, ts[len(ts)-1].Source.End}
					transAR := vseg.addrRangeOf(transMR)
					pseg = mm.pmas.Isolate(pseg, transAR)
					pseg.ValuePtr().file.DecRef(pseg.fileRange())
					pgap = mm.pmas.Remove(pseg)
					pstart = pmaIterator{} // iterators invalidated
					for _, t := range ts {
						newpmaAR := vseg.addrRangeOf(t.Source)
						newpma := pma{
							file:           t.File,
							off:            t.Offset,
							translatePerms: t.Perms,
							effectivePerms: vma.effectivePerms.Intersect(t.Perms),
							maxPerms:       vma.maxPerms.Intersect(t.Perms),
						}
						if vma.private {
							newpma.effectivePerms.Write = false
							newpma.maxPerms.Write = false
							newpma.needCOW = true
						}
						t.File.IncRef(t.FileRange(), memCgID)
						pseg = mm.pmas.Insert(pgap, newpmaAR, newpma)
						pgap = pseg.NextGap()
					}
					// The error returned by Translate is only significant if
					// it occurred before ar.End.
					if err != nil && pseg.End() < ar.End {
						return pstart, pgap, err
					}
					// Ensure pseg and pgap are correct for the next iteration
					// of the loop.
					if pgap.Range().Length() == 0 {
						pseg, pgap = pgap.NextSegment(), pmaGapIterator{}
					} else {
						pseg = pmaIterator{}
					}
				} else {
					// We have a usable pma; continue.
					pseg, pgap = pseg.NextNonEmpty()
				}

			default:
				break pmaLoop
			}
		}
		// Go to the next vma.
		if ar.End <= vseg.End() {
			if pgap.Ok() {
				return pstart, pgap, nil
			}
			return pstart, pseg.PrevGap(), nil
		}
		vseg = vseg.NextSegment()
	}
}

const (
	// When memory is allocated for a private pma, align the allocated address
	// range to a privateAllocUnit boundary when possible. Larger values of
	// privateAllocUnit may reduce page faults by allowing fewer, larger pmas
	// to be mapped, but may result in larger amounts of wasted memory in the
	// presence of fragmentation. privateAllocUnit must be a power-of-2
	// multiple of hostarch.PageSize.
	privateAllocUnit = hostarch.HugePageSize

	privateAllocMask = privateAllocUnit - 1
)

func privateAligned(ar hostarch.AddrRange) hostarch.AddrRange {
	aligned := hostarch.AddrRange{ar.Start &^ privateAllocMask, ar.End}
	if end := (ar.End + privateAllocMask) &^ privateAllocMask; end >= ar.End {
		aligned.End = end
	}
	if checkInvariants {
		if !aligned.IsSupersetOf(ar) {
			panic(fmt.Sprintf("aligned AddrRange %#v is not a superset of ar %#v", aligned, ar))
		}
	}
	return aligned
}
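
// Worked example (illustrative; assumes hostarch.HugePageSize is 2 MiB, as on
// amd64 and arm64): privateAligned({0x201000, 0x202000}) returns
// {0x200000, 0x400000}. getPMAsInternalLocked uses this widened range as
// maskAR, so a private anonymous allocation for a single faulting page may be
// extended up to the surrounding huge-page-aligned region, limited by the
// enclosing vma and the free pma gap. The end is only widened when rounding
// up does not overflow the address space.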

// isPMACopyOnWriteLocked returns true if the contents of the pma represented
// by pseg must be copied to a new private pma to be written to.
//
// If the pma is a copy-on-write private pma, and holds the only reference on
// the memory it maps, isPMACopyOnWriteLocked will take ownership of the memory
// and update the pma to indicate that it does not require copy-on-write.
//
// Preconditions:
//   - vseg.Range().IsSupersetOf(pseg.Range()).
//   - mm.mappingMu must be locked.
//   - mm.activeMu must be locked for writing.
func (mm *MemoryManager) isPMACopyOnWriteLocked(vseg vmaIterator, pseg pmaIterator) bool {
	pma := pseg.ValuePtr()
	if !pma.needCOW {
		return false
	}
	if !pma.private {
		return true
	}
	// If we have the only reference on private memory to be copied, just take
	// ownership of it instead of copying. If we do hold the only reference,
	// additional references can only be taken by mm.Fork(), which is excluded
	// by mm.activeMu, so this isn't racy.
	if mm.mf.HasUniqueRef(pseg.fileRange()) {
		pma.needCOW = false
		// pma.private => pma.translatePerms == hostarch.AnyAccess
		vma := vseg.ValuePtr()
		pma.effectivePerms = vma.effectivePerms
		pma.maxPerms = vma.maxPerms
		return false
	}
	return true
}

// Invalidate implements memmap.MappingSpace.Invalidate.
func (mm *MemoryManager) Invalidate(ar hostarch.AddrRange, opts memmap.InvalidateOpts) {
	if checkInvariants {
		if !ar.WellFormed() || ar.Length() == 0 || !ar.IsPageAligned() {
			panic(fmt.Sprintf("invalid ar: %v", ar))
		}
	}

	mm.activeMu.Lock()
	defer mm.activeMu.Unlock()
	if mm.captureInvalidations {
		mm.capturedInvalidations = append(mm.capturedInvalidations, invalidateArgs{ar, opts})
		return
	}
	mm.invalidateLocked(ar, opts.InvalidatePrivate, true)
}

// invalidateLocked removes pmas and AddressSpace mappings of those pmas for
// addresses in ar.
//
// Preconditions:
//   - mm.activeMu must be locked for writing.
//   - ar.Length() != 0.
//   - ar must be page-aligned.
func (mm *MemoryManager) invalidateLocked(ar hostarch.AddrRange, invalidatePrivate, invalidateShared bool) {
	if checkInvariants {
		if !ar.WellFormed() || ar.Length() == 0 || !ar.IsPageAligned() {
			panic(fmt.Sprintf("invalid ar: %v", ar))
		}
	}

	var didUnmapAS bool
	pseg := mm.pmas.LowerBoundSegment(ar.Start)
	for pseg.Ok() && pseg.Start() < ar.End {
		pma := pseg.ValuePtr()
		if (invalidatePrivate && pma.private) || (invalidateShared && !pma.private) {
			pseg = mm.pmas.Isolate(pseg, ar)
			pma = pseg.ValuePtr()
			if !didUnmapAS {
				// Unmap all of ar, not just pseg.Range(), to minimize host
				// syscalls. AddressSpace mappings must be removed before
				// pma.file.DecRef().
				//
				// Note that we do more than just ar here, and extrapolate
				// to the end of any previous region that we may have mapped.
				// This is done to ensure that lower layers can fully invalidate
				// intermediate pagetable pages during the unmap.
				var unmapAR hostarch.AddrRange
				if prev := pseg.PrevSegment(); prev.Ok() {
					unmapAR.Start = prev.End()
				} else {
					unmapAR.Start = mm.layout.MinAddr
				}
				if last := mm.pmas.LowerBoundSegment(ar.End); last.Ok() {
					if last.Start() < ar.End {
						unmapAR.End = ar.End
					} else {
						unmapAR.End = last.Start()
					}
				} else {
					unmapAR.End = mm.layout.MaxAddr
				}
				mm.unmapASLocked(unmapAR)
				didUnmapAS = true
			}
			mm.removeRSSLocked(pseg.Range())
			pma.file.DecRef(pseg.fileRange())
			pseg = mm.pmas.Remove(pseg).NextSegment()
		} else {
			pseg = pseg.NextSegment()
		}
	}
}

// Pin returns the memmap.File ranges currently mapped by addresses in ar in
// mm, acquiring a reference on the returned ranges which the caller must
// release by calling Unpin. If not all addresses are mapped, Pin returns a
// non-nil error. Note that Pin may return both a non-empty slice of
// PinnedRanges and a non-nil error.
//
// Pin does not prevent mapped ranges from changing, making it unsuitable for
// most I/O. It should only be used in contexts that would use get_user_pages()
// in the Linux kernel.
//
// Preconditions:
//   - ar.Length() != 0.
//   - ar must be page-aligned.
func (mm *MemoryManager) Pin(ctx context.Context, ar hostarch.AddrRange, at hostarch.AccessType, ignorePermissions bool) ([]PinnedRange, error) {
	if checkInvariants {
		if !ar.WellFormed() || ar.Length() == 0 || !ar.IsPageAligned() {
			panic(fmt.Sprintf("invalid ar: %v", ar))
		}
	}

	// Ensure that we have usable vmas.
	mm.mappingMu.RLock()
	vseg, vend, verr := mm.getVMAsLocked(ctx, ar, at, ignorePermissions)
	if vendaddr := vend.Start(); vendaddr < ar.End {
		if vendaddr <= ar.Start {
			mm.mappingMu.RUnlock()
			return nil, verr
		}
		ar.End = vendaddr
	}

	// Ensure that we have usable pmas.
	mm.activeMu.Lock()
	pseg, pend, perr := mm.getPMAsLocked(ctx, vseg, ar, at)
	mm.mappingMu.RUnlock()
	if pendaddr := pend.Start(); pendaddr < ar.End {
		if pendaddr <= ar.Start {
			mm.activeMu.Unlock()
			return nil, perr
		}
		ar.End = pendaddr
	}

	memCgID := pgalloc.MemoryCgroupIDFromContext(ctx)
	// Gather pmas.
	var prs []PinnedRange
	for pseg.Ok() && pseg.Start() < ar.End {
		psar := pseg.Range().Intersect(ar)
		f := pseg.ValuePtr().file
		fr := pseg.fileRangeOf(psar)
		f.IncRef(fr, memCgID)
		prs = append(prs, PinnedRange{
			Source: psar,
			File:   f,
			Offset: fr.Start,
		})
		pseg = pseg.NextSegment()
	}
	mm.activeMu.Unlock()

	// Return the first error in order of progress through ar.
	if perr != nil {
		return prs, perr
	}
	return prs, verr
}

// PinnedRanges are returned by MemoryManager.Pin.
type PinnedRange struct {
	// Source is the corresponding range of addresses.
	Source hostarch.AddrRange

	// File is the mapped file.
	File memmap.File

	// Offset is the offset into File at which this PinnedRange begins.
	Offset uint64
}

// FileRange returns the memmap.File offsets mapped by pr.
func (pr PinnedRange) FileRange() memmap.FileRange {
	return memmap.FileRange{pr.Offset, pr.Offset + uint64(pr.Source.Length())}
}

// Unpin releases the reference held by prs.
func Unpin(prs []PinnedRange) {
	for i := range prs {
		prs[i].File.DecRef(prs[i].FileRange())
	}
}
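
// Illustrative usage sketch (not part of this file; memMgr, ctx, ar, and use
// are hypothetical). Callers pair Pin with Unpin and must tolerate Pin
// returning both pinned ranges and an error:
//
//	prs, err := memMgr.Pin(ctx, ar, hostarch.ReadWrite, false /* ignorePermissions */)
//	defer mm.Unpin(prs) // mm is this package's name; safe even if prs is empty
//	if err != nil {
//		return err
//	}
//	for _, pr := range prs {
//		use(pr.File, pr.FileRange())
//	}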

// movePMAsLocked moves all pmas in oldAR to newAR.
//
// Preconditions:
//   - mm.activeMu must be locked for writing.
//   - oldAR.Length() != 0.
//   - oldAR.Length() <= newAR.Length().
//   - !oldAR.Overlaps(newAR).
//   - mm.pmas.IsEmptyRange(newAR).
//   - oldAR and newAR must be page-aligned.
func (mm *MemoryManager) movePMAsLocked(oldAR, newAR hostarch.AddrRange) {
	if checkInvariants {
		if !oldAR.WellFormed() || oldAR.Length() == 0 || !oldAR.IsPageAligned() {
			panic(fmt.Sprintf("invalid oldAR: %v", oldAR))
		}
		if !newAR.WellFormed() || newAR.Length() == 0 || !newAR.IsPageAligned() {
			panic(fmt.Sprintf("invalid newAR: %v", newAR))
		}
		if oldAR.Length() > newAR.Length() {
			panic(fmt.Sprintf("old address range %v may contain pmas that will not fit in new address range %v", oldAR, newAR))
		}
		if oldAR.Overlaps(newAR) {
			panic(fmt.Sprintf("old and new address ranges overlap: %v, %v", oldAR, newAR))
		}
		// mm.pmas.IsEmptyRange is checked by mm.pmas.Insert.
	}

	type movedPMA struct {
		oldAR hostarch.AddrRange
		pma   pma
	}
	var movedPMAs []movedPMA
	pseg := mm.pmas.LowerBoundSegment(oldAR.Start)
	for pseg.Ok() && pseg.Start() < oldAR.End {
		pseg = mm.pmas.Isolate(pseg, oldAR)
		movedPMAs = append(movedPMAs, movedPMA{
			oldAR: pseg.Range(),
			pma:   pseg.Value(),
		})
		pseg = mm.pmas.Remove(pseg).NextSegment()
		// No RSS change is needed since we're re-inserting the same pmas
		// below.
	}

	off := newAR.Start - oldAR.Start
	pgap := mm.pmas.FindGap(newAR.Start)
	for i := range movedPMAs {
		mpma := &movedPMAs[i]
		pmaNewAR := hostarch.AddrRange{mpma.oldAR.Start + off, mpma.oldAR.End + off}
		pgap = mm.pmas.Insert(pgap, pmaNewAR, mpma.pma).NextGap()
	}

	mm.unmapASLocked(oldAR)
}
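
// Note that movePMAsLocked leaves RSS unchanged: the same pmas are removed
// from oldAR and re-inserted at newAR. Only AddressSpace mappings for oldAR
// are unmapped here; mappings for newAR are established lazily by later calls
// to mapASLocked (for example, on the next fault in the moved range).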

// getPMAInternalMappingsLocked ensures that pmas for all addresses in ar have
// cached internal mappings. It returns:
//
//   - An iterator to the gap after the last pma with internal mappings
//     containing an address in ar. If internal mappings exist for no addresses in
//     ar, the iterator is to a gap that begins before ar.Start.
//
//   - An error that is non-nil if internal mappings exist for only a subset of
//     ar.
//
// Preconditions:
//   - mm.activeMu must be locked for writing.
//   - pseg.Range().Contains(ar.Start).
//   - pmas must exist for all addresses in ar.
//   - ar.Length() != 0.
//
// Postconditions: getPMAInternalMappingsLocked does not invalidate iterators
// into mm.pmas.
func (mm *MemoryManager) getPMAInternalMappingsLocked(pseg pmaIterator, ar hostarch.AddrRange) (pmaGapIterator, error) {
	if checkInvariants {
		if !ar.WellFormed() || ar.Length() == 0 {
			panic(fmt.Sprintf("invalid ar: %v", ar))
		}
		if !pseg.Range().Contains(ar.Start) {
			panic(fmt.Sprintf("initial pma %v does not cover start of ar %v", pseg.Range(), ar))
		}
	}

	for {
		if err := pseg.getInternalMappingsLocked(); err != nil {
			return pseg.PrevGap(), err
		}
		if ar.End <= pseg.End() {
			return pseg.NextGap(), nil
		}
		pseg, _ = pseg.NextNonEmpty()
	}
}

// getVecPMAInternalMappingsLocked ensures that pmas for all addresses in ars
// have cached internal mappings. It returns the subset of ars for which
// internal mappings exist. If this is not equal to ars, it returns a non-nil
// error explaining why.
//
// Preconditions:
//   - mm.activeMu must be locked for writing.
//   - pmas must exist for all addresses in ars.
//
// Postconditions: getVecPMAInternalMappingsLocked does not invalidate iterators
// into mm.pmas.
func (mm *MemoryManager) getVecPMAInternalMappingsLocked(ars hostarch.AddrRangeSeq) (hostarch.AddrRangeSeq, error) {
	for arsit := ars; !arsit.IsEmpty(); arsit = arsit.Tail() {
		ar := arsit.Head()
		if ar.Length() == 0 {
			continue
		}
		if pend, err := mm.getPMAInternalMappingsLocked(mm.pmas.FindSegment(ar.Start), ar); err != nil {
			return truncatedAddrRangeSeq(ars, arsit, pend.Start()), err
		}
	}
	return ars, nil
}

// internalMappingsLocked returns internal mappings for addresses in ar.
//
// Preconditions:
//   - mm.activeMu must be locked.
//   - Internal mappings must have been previously established for all addresses
//     in ar.
//   - ar.Length() != 0.
//   - pseg.Range().Contains(ar.Start).
func (mm *MemoryManager) internalMappingsLocked(pseg pmaIterator, ar hostarch.AddrRange) safemem.BlockSeq {
	if checkInvariants {
		if !ar.WellFormed() || ar.Length() == 0 {
			panic(fmt.Sprintf("invalid ar: %v", ar))
		}
		if !pseg.Range().Contains(ar.Start) {
			panic(fmt.Sprintf("initial pma %v does not cover start of ar %v", pseg.Range(), ar))
		}
	}

	if ar.End <= pseg.End() {
		// Since only one pma is involved, we can use pma.internalMappings
		// directly, avoiding a slice allocation.
		offset := uint64(ar.Start - pseg.Start())
		return pseg.ValuePtr().internalMappings.DropFirst64(offset).TakeFirst64(uint64(ar.Length()))
	}

	var ims []safemem.Block
	for {
		pr := pseg.Range().Intersect(ar)
		for pims := pseg.ValuePtr().internalMappings.DropFirst64(uint64(pr.Start - pseg.Start())).TakeFirst64(uint64(pr.Length())); !pims.IsEmpty(); pims = pims.Tail() {
			ims = append(ims, pims.Head())
		}
		if ar.End <= pseg.End() {
			break
		}
		pseg = pseg.NextSegment()
	}
	return safemem.BlockSeqFromSlice(ims)
}
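
// Worked example (illustrative, assuming 4 KiB pages): for a single pma
// covering [0x400000, 0x404000) and ar = [0x401000, 0x403000), the fast path
// above returns pma.internalMappings.DropFirst64(0x1000).TakeFirst64(0x2000),
// i.e. the middle two pages of the cached mapping, without allocating a
// slice. Ranges spanning multiple pmas fall through to the slice-building
// loop.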

// vecInternalMappingsLocked returns internal mappings for addresses in ars.
//
// Preconditions:
//   - mm.activeMu must be locked.
//   - Internal mappings must have been previously established for all addresses
//     in ars.
func (mm *MemoryManager) vecInternalMappingsLocked(ars hostarch.AddrRangeSeq) safemem.BlockSeq {
	var ims []safemem.Block
	for ; !ars.IsEmpty(); ars = ars.Tail() {
		ar := ars.Head()
		if ar.Length() == 0 {
			continue
		}
		for pims := mm.internalMappingsLocked(mm.pmas.FindSegment(ar.Start), ar); !pims.IsEmpty(); pims = pims.Tail() {
			ims = append(ims, pims.Head())
		}
	}
	return safemem.BlockSeqFromSlice(ims)
}

// addRSSLocked updates the current and maximum resident set size of a
// MemoryManager to reflect the insertion of a pma at ar.
//
// Preconditions: mm.activeMu must be locked for writing.
func (mm *MemoryManager) addRSSLocked(ar hostarch.AddrRange) {
	mm.curRSS += uint64(ar.Length())
	if mm.curRSS > mm.maxRSS {
		mm.maxRSS = mm.curRSS
	}
}

// removeRSSLocked updates the current resident set size of a MemoryManager to
// reflect the removal of a pma at ar.
//
// Preconditions: mm.activeMu must be locked for writing.
func (mm *MemoryManager) removeRSSLocked(ar hostarch.AddrRange) {
	mm.curRSS -= uint64(ar.Length())
}

// pmaSetFunctions implements segment.Functions for pmaSet.
type pmaSetFunctions struct{}

func (pmaSetFunctions) MinKey() hostarch.Addr {
	return 0
}

func (pmaSetFunctions) MaxKey() hostarch.Addr {
	return ^hostarch.Addr(0)
}

func (pmaSetFunctions) ClearValue(pma *pma) {
	pma.file = nil
	pma.internalMappings = safemem.BlockSeq{}
}

func (pmaSetFunctions) Merge(ar1 hostarch.AddrRange, pma1 pma, ar2 hostarch.AddrRange, pma2 pma) (pma, bool) {
	if pma1.file != pma2.file ||
		pma1.off+uint64(ar1.Length()) != pma2.off ||
		pma1.translatePerms != pma2.translatePerms ||
		pma1.effectivePerms != pma2.effectivePerms ||
		pma1.maxPerms != pma2.maxPerms ||
		pma1.needCOW != pma2.needCOW ||
		pma1.private != pma2.private {
		return pma{}, false
	}

	// Discard internal mappings instead of trying to merge them, since merging
	// them requires an allocation and getting them again from the
	// memmap.File might not.
	pma1.internalMappings = safemem.BlockSeq{}
	return pma1, true
}

func (pmaSetFunctions) Split(ar hostarch.AddrRange, p pma, split hostarch.Addr) (pma, pma) {
	newlen1 := uint64(split - ar.Start)
	p2 := p
	p2.off += newlen1
	if !p.internalMappings.IsEmpty() {
		p.internalMappings = p.internalMappings.TakeFirst64(newlen1)
		p2.internalMappings = p2.internalMappings.DropFirst64(newlen1)
	}
	return p, p2
}
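
// Worked example (illustrative): splitting a pma that covers
// [0x400000, 0x402000) with off = 0x10000 at address 0x401000 yields a left
// pma with off = 0x10000 and a right pma with off = 0x11000; cached internal
// mappings, if any, are divided at the same 0x1000-byte boundary. Merge is
// the inverse: two pmas coalesce only if they are adjacent in the backing
// memmap.File (pma1.off + ar1.Length() == pma2.off) and agree on every other
// compared field, in which case any cached internal mappings are discarded.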

// findOrSeekPrevUpperBoundPMA returns mm.pmas.UpperBoundSegment(addr), but may do
// so by scanning linearly backward from pgap.
//
// Preconditions:
//   - mm.activeMu must be locked.
//   - addr <= pgap.Start().
func (mm *MemoryManager) findOrSeekPrevUpperBoundPMA(addr hostarch.Addr, pgap pmaGapIterator) pmaIterator {
	if checkInvariants {
		if !pgap.Ok() {
			panic("terminal pma iterator")
		}
		if addr > pgap.Start() {
			panic(fmt.Sprintf("can't seek backward to %#x from %#x", addr, pgap.Start()))
		}
	}
	// Optimistically check if pgap.PrevSegment() is the PMA we're looking for,
	// which is the case if findOrSeekPrevUpperBoundPMA is called to find the
	// start of a range containing only a single PMA.
	if pseg := pgap.PrevSegment(); pseg.Start() <= addr {
		return pseg
	}
	return mm.pmas.UpperBoundSegment(addr)
}

// getInternalMappingsLocked ensures that pseg.ValuePtr().internalMappings is
// non-empty.
//
// Preconditions: mm.activeMu must be locked for writing.
func (pseg pmaIterator) getInternalMappingsLocked() error {
	pma := pseg.ValuePtr()
	if pma.internalMappings.IsEmpty() {
		// This must use maxPerms (instead of perms) because some permission
		// constraints are only visible to vmas; for example, mappings of
		// read-only files have vma.maxPerms.Write unset, but this may not be
		// visible to the memmap.Mappable.
		perms := pma.maxPerms
		// We will never execute application code through an internal mapping.
		perms.Execute = false
		ims, err := pma.file.MapInternal(pseg.fileRange(), perms)
		if err != nil {
			return err
		}
		pma.internalMappings = ims
	}
	return nil
}

func (pseg pmaIterator) fileRange() memmap.FileRange {
	return pseg.fileRangeOf(pseg.Range())
}

// Preconditions:
//   - pseg.Range().IsSupersetOf(ar).
//   - ar.Length() != 0.
func (pseg pmaIterator) fileRangeOf(ar hostarch.AddrRange) memmap.FileRange {
	if checkInvariants {
		if !pseg.Ok() {
			panic("terminal pma iterator")
		}
		if !ar.WellFormed() || ar.Length() == 0 {
			panic(fmt.Sprintf("invalid ar: %v", ar))
		}
		if !pseg.Range().IsSupersetOf(ar) {
			panic(fmt.Sprintf("ar %v out of bounds %v", ar, pseg.Range()))
		}
	}

	pma := pseg.ValuePtr()
	pstart := pseg.Start()
	return memmap.FileRange{pma.off + uint64(ar.Start-pstart), pma.off + uint64(ar.End-pstart)}
}
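
// Worked example (illustrative): for a pma segment covering
// [0x400000, 0x404000) with pma.off = 0x9000, fileRangeOf([0x401000, 0x403000))
// returns the memmap.FileRange [0xA000, 0xC000): offsets within the pma
// translate directly into offsets from pma.off in the backing memmap.File.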