github.com/MerlinKodo/gvisor@v0.0.0-20231110090155-957f62ecf90e/pkg/sentry/mm/pma.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package mm

import (
	"fmt"
	"sync/atomic"

	"github.com/MerlinKodo/gvisor/pkg/context"
	"github.com/MerlinKodo/gvisor/pkg/errors/linuxerr"
	"github.com/MerlinKodo/gvisor/pkg/hostarch"
	"github.com/MerlinKodo/gvisor/pkg/safecopy"
	"github.com/MerlinKodo/gvisor/pkg/safemem"
	"github.com/MerlinKodo/gvisor/pkg/sentry/memmap"
	"github.com/MerlinKodo/gvisor/pkg/sentry/pgalloc"
	"github.com/MerlinKodo/gvisor/pkg/sentry/usage"
)

// existingPMAsLocked checks that pmas exist for all addresses in ar, and
// support access of type (at, ignorePermissions). If so, it returns an
// iterator to the pma containing ar.Start. Otherwise it returns a terminal
// iterator.
//
// Preconditions:
//   - mm.activeMu must be locked.
//   - ar.Length() != 0.
func (mm *MemoryManager) existingPMAsLocked(ar hostarch.AddrRange, at hostarch.AccessType, ignorePermissions bool, needInternalMappings bool) pmaIterator {
	if checkInvariants {
		if !ar.WellFormed() || ar.Length() == 0 {
			panic(fmt.Sprintf("invalid ar: %v", ar))
		}
	}

	first := mm.pmas.FindSegment(ar.Start)
	pseg := first
	for pseg.Ok() {
		pma := pseg.ValuePtr()
		perms := pma.effectivePerms
		if ignorePermissions {
			perms = pma.maxPerms
		}
		if !perms.SupersetOf(at) {
			return pmaIterator{}
		}
		if needInternalMappings && pma.internalMappings.IsEmpty() {
			return pmaIterator{}
		}

		if ar.End <= pseg.End() {
			return first
		}
		pseg, _ = pseg.NextNonEmpty()
	}

	// Ran out of pmas before reaching ar.End.
	return pmaIterator{}
}

// existingVecPMAsLocked returns true if pmas exist for all addresses in ars,
// and support access of type (at, ignorePermissions).
//
// Preconditions: mm.activeMu must be locked.
func (mm *MemoryManager) existingVecPMAsLocked(ars hostarch.AddrRangeSeq, at hostarch.AccessType, ignorePermissions bool, needInternalMappings bool) bool {
	for ; !ars.IsEmpty(); ars = ars.Tail() {
		if ar := ars.Head(); ar.Length() != 0 && !mm.existingPMAsLocked(ar, at, ignorePermissions, needInternalMappings).Ok() {
			return false
		}
	}
	return true
}

// getPMAsLocked ensures that pmas exist for all addresses in ar, and support
// access of type at. It returns:
//
//   - An iterator to the pma containing ar.Start. If no pma contains ar.Start,
//     the iterator is unspecified.
//
//   - An iterator to the gap after the last pma containing an address in ar. If
//     pmas exist for no addresses in ar, the iterator is to a gap that begins
//     before ar.Start.
//
//   - An error that is non-nil if pmas exist for only a subset of ar.
//
// Preconditions:
//   - mm.mappingMu must be locked.
//   - mm.activeMu must be locked for writing.
//   - ar.Length() != 0.
//   - vseg.Range().Contains(ar.Start).
//   - vmas must exist for all addresses in ar, and support accesses of type at
//     (i.e. permission checks must have been performed against vmas).
func (mm *MemoryManager) getPMAsLocked(ctx context.Context, vseg vmaIterator, ar hostarch.AddrRange, at hostarch.AccessType) (pmaIterator, pmaGapIterator, error) {
	if checkInvariants {
		if !ar.WellFormed() || ar.Length() == 0 {
			panic(fmt.Sprintf("invalid ar: %v", ar))
		}
		if !vseg.Ok() {
			panic("terminal vma iterator")
		}
		if !vseg.Range().Contains(ar.Start) {
			panic(fmt.Sprintf("initial vma %v does not cover start of ar %v", vseg.Range(), ar))
		}
	}

	// Page-align ar so that all AddrRanges are aligned.
	end, ok := ar.End.RoundUp()
	var alignerr error
	if !ok {
		end = ar.End.RoundDown()
		alignerr = linuxerr.EFAULT
	}
	ar = hostarch.AddrRange{ar.Start.RoundDown(), end}

	pstart, pend, perr := mm.getPMAsInternalLocked(ctx, vseg, ar, at)
	if pend.Start() <= ar.Start {
		return pmaIterator{}, pend, perr
	}
	// getPMAsInternalLocked may not have returned pstart due to iterator
	// invalidation.
	if !pstart.Ok() {
		pstart = mm.findOrSeekPrevUpperBoundPMA(ar.Start, pend)
	}
	if perr != nil {
		return pstart, pend, perr
	}
	return pstart, pend, alignerr
}

// getVecPMAsLocked ensures that pmas exist for all addresses in ars, and
// support access of type at. It returns the subset of ars for which pmas
// exist. If this is not equal to ars, it returns a non-nil error explaining
// why.
//
// Preconditions:
//   - mm.mappingMu must be locked.
//   - mm.activeMu must be locked for writing.
//   - vmas must exist for all addresses in ars, and support accesses of type at
//     (i.e. permission checks must have been performed against vmas).
func (mm *MemoryManager) getVecPMAsLocked(ctx context.Context, ars hostarch.AddrRangeSeq, at hostarch.AccessType) (hostarch.AddrRangeSeq, error) {
	for arsit := ars; !arsit.IsEmpty(); arsit = arsit.Tail() {
		ar := arsit.Head()
		if ar.Length() == 0 {
			continue
		}
		if checkInvariants {
			if !ar.WellFormed() {
				panic(fmt.Sprintf("invalid ar: %v", ar))
			}
		}

		// Page-align ar so that all AddrRanges are aligned.
		end, ok := ar.End.RoundUp()
		var alignerr error
		if !ok {
			end = ar.End.RoundDown()
			alignerr = linuxerr.EFAULT
		}
		ar = hostarch.AddrRange{ar.Start.RoundDown(), end}

		_, pend, perr := mm.getPMAsInternalLocked(ctx, mm.vmas.FindSegment(ar.Start), ar, at)
		if perr != nil {
			return truncatedAddrRangeSeq(ars, arsit, pend.Start()), perr
		}
		if alignerr != nil {
			return truncatedAddrRangeSeq(ars, arsit, pend.Start()), alignerr
		}
	}

	return ars, nil
}
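
// For illustration: both getPMAsLocked and getVecPMAsLocked widen the
// requested range to page boundaries before calling getPMAsInternalLocked.
// Assuming 4 KiB pages, a request for [0x100f00, 0x102010) becomes
// [0x100000, 0x103000). If ar.End falls in the last page of the address
// space so that RoundUp overflows, the range is instead truncated to
// ar.End.RoundDown(), and linuxerr.EFAULT is reported for the lost tail
// once pmas exist for the truncated range.
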
// getPMAsInternalLocked is equivalent to getPMAsLocked, with the following
// exceptions:
//
//   - getPMAsInternalLocked returns a pmaIterator on a best-effort basis (that
//     is, the returned iterator may be terminal, even if a pma that contains
//     ar.Start exists). Returning this iterator on a best-effort basis allows
//     callers that require it to use it when it's cheaply available, while also
//     avoiding the overhead of retrieving it when it's not.
//
//   - getPMAsInternalLocked additionally requires that ar is page-aligned.
//     getPMAsInternalLocked is an implementation helper for getPMAsLocked and
//     getVecPMAsLocked; other clients should call one of those instead.
func (mm *MemoryManager) getPMAsInternalLocked(ctx context.Context, vseg vmaIterator, ar hostarch.AddrRange, at hostarch.AccessType) (pmaIterator, pmaGapIterator, error) {
	if checkInvariants {
		if !ar.WellFormed() || ar.Length() == 0 || !ar.IsPageAligned() {
			panic(fmt.Sprintf("invalid ar: %v", ar))
		}
		if !vseg.Ok() {
			panic("terminal vma iterator")
		}
		if !vseg.Range().Contains(ar.Start) {
			panic(fmt.Sprintf("initial vma %v does not cover start of ar %v", vseg.Range(), ar))
		}
	}

	memCgID := pgalloc.MemoryCgroupIDFromContext(ctx)
	opts := pgalloc.AllocOpts{Kind: usage.Anonymous, Dir: pgalloc.BottomUp, MemCgID: memCgID}
	vma := vseg.ValuePtr()
	if uintptr(ar.Start) < atomic.LoadUintptr(&vma.lastFault) {
		// Detect cases where memory is accessed downwards and change memory file
		// allocation order to increase the chances that pages are coalesced.
		opts.Dir = pgalloc.TopDown
	}
	atomic.StoreUintptr(&vma.lastFault, uintptr(ar.Start))

	mf := mm.mfp.MemoryFile()
	// Limit the range we allocate to ar, aligned to privateAllocUnit.
	maskAR := privateAligned(ar)
	didUnmapAS := false
	// The range in which we iterate vmas and pmas is still limited to ar, to
	// ensure that we don't allocate or COW-break a pma we don't need.
	pseg, pgap := mm.pmas.Find(ar.Start)
	pstart := pseg
	for {
		// Get pmas for this vma.
		vsegAR := vseg.Range().Intersect(ar)
		vma := vseg.ValuePtr()
	pmaLoop:
		for {
			switch {
			case pgap.Ok() && pgap.Start() < vsegAR.End:
				// Need a pma here.
				optAR := vseg.Range().Intersect(pgap.Range())
				if checkInvariants {
					if optAR.Length() == 0 {
						panic(fmt.Sprintf("vseg %v and pgap %v do not overlap", vseg, pgap))
					}
				}
				if vma.mappable == nil {
					// Private anonymous mappings get pmas by allocating.
					allocAR := optAR.Intersect(maskAR)
					fr, err := mf.Allocate(uint64(allocAR.Length()), opts)
					if err != nil {
						return pstart, pgap, err
					}
					if checkInvariants {
						if !fr.WellFormed() || fr.Length() != uint64(allocAR.Length()) {
							panic(fmt.Sprintf("Allocate(%v) returned invalid FileRange %v", allocAR.Length(), fr))
						}
					}
					mm.addRSSLocked(allocAR)
					mm.incPrivateRef(fr)
					mf.IncRef(fr, memCgID)
					pseg, pgap = mm.pmas.Insert(pgap, allocAR, pma{
						file:           mf,
						off:            fr.Start,
						translatePerms: hostarch.AnyAccess,
						effectivePerms: vma.effectivePerms,
						maxPerms:       vma.maxPerms,
						// Since we just allocated this memory and have the
						// only reference, the new pma does not need
						// copy-on-write.
						private: true,
					}).NextNonEmpty()
					pstart = pmaIterator{} // iterators invalidated
				} else {
					// Other mappings get pmas by translating.
					optMR := vseg.mappableRangeOf(optAR)
					reqAR := optAR.Intersect(ar)
					reqMR := vseg.mappableRangeOf(reqAR)
					perms := at
					if vma.private {
						// This pma will be copy-on-write; don't require write
						// permission, but do require read permission to
						// facilitate the copy.
						//
						// If at.Write is true, we will need to break
						// copy-on-write immediately, which occurs after
						// translation below.
						perms.Read = true
						perms.Write = false
					}
					ts, err := vma.mappable.Translate(ctx, reqMR, optMR, perms)
					if checkInvariants {
						if err := memmap.CheckTranslateResult(reqMR, optMR, perms, ts, err); err != nil {
							panic(fmt.Sprintf("Mappable(%T).Translate(%v, %v, %v): %v", vma.mappable, reqMR, optMR, perms, err))
						}
					}
					// Install a pma for each translation.
					if len(ts) == 0 {
						return pstart, pgap, err
					}
					pstart = pmaIterator{} // iterators invalidated
					for _, t := range ts {
						newpmaAR := vseg.addrRangeOf(t.Source)
						newpma := pma{
							file:           t.File,
							off:            t.Offset,
							translatePerms: t.Perms,
							effectivePerms: vma.effectivePerms.Intersect(t.Perms),
							maxPerms:       vma.maxPerms.Intersect(t.Perms),
						}
						if vma.private {
							newpma.effectivePerms.Write = false
							newpma.maxPerms.Write = false
							newpma.needCOW = true
						}
						mm.addRSSLocked(newpmaAR)
						t.File.IncRef(t.FileRange(), memCgID)
						// This is valid because memmap.Mappable.Translate is
						// required to return Translations in increasing
						// Translation.Source order.
						pseg = mm.pmas.Insert(pgap, newpmaAR, newpma)
						pgap = pseg.NextGap()
					}
					// The error returned by Translate is only significant if
					// it occurred before ar.End.
					if err != nil && vseg.addrRangeOf(ts[len(ts)-1].Source).End < ar.End {
						return pstart, pgap, err
					}
					// Rewind pseg to the first pma inserted and continue the
					// loop to check if we need to break copy-on-write.
					pseg, pgap = mm.findOrSeekPrevUpperBoundPMA(vseg.addrRangeOf(ts[0].Source).Start, pgap), pmaGapIterator{}
					continue
				}

			case pseg.Ok() && pseg.Start() < vsegAR.End:
				oldpma := pseg.ValuePtr()
				if at.Write && mm.isPMACopyOnWriteLocked(vseg, pseg) {
					// Break copy-on-write by copying.
					if checkInvariants {
						if !oldpma.maxPerms.Read {
							panic(fmt.Sprintf("pma %v needs to be copied for writing, but is not readable: %v", pseg.Range(), oldpma))
						}
					}
					var copyAR hostarch.AddrRange
					if vma := vseg.ValuePtr(); vma.effectivePerms.Execute {
						// The majority of copy-on-write breaks on executable
						// pages come from:
						//
						// - The ELF loader, which must zero out bytes on the
						//   last page of each segment after the end of the
						//   segment.
						//
						// - gdb's use of ptrace to insert breakpoints.
						//
						// Neither of these cases has enough spatial locality
						// to benefit from copying nearby pages, so if the vma
						// is executable, only copy the pages required.
						copyAR = pseg.Range().Intersect(ar)
					} else if vma.growsDown {
						// In most cases, the new process will not use most of
						// its stack before exiting or invoking execve(); it is
						// especially unlikely to return very far down its call
						// stack, since async-signal-safety concerns in
						// multithreaded programs prevent the new process from
						// being able to do much. So only copy up to one page
						// before and after the pages required.
						stackMaskAR := ar
						if newStart := stackMaskAR.Start - hostarch.PageSize; newStart < stackMaskAR.Start {
							stackMaskAR.Start = newStart
						}
						if newEnd := stackMaskAR.End + hostarch.PageSize; newEnd > stackMaskAR.End {
							stackMaskAR.End = newEnd
						}
						copyAR = pseg.Range().Intersect(stackMaskAR)
					} else {
						copyAR = pseg.Range().Intersect(maskAR)
					}
					// Get internal mappings from the pma to copy from.
					if err := pseg.getInternalMappingsLocked(); err != nil {
						return pstart, pseg.PrevGap(), err
					}
					// Copy contents.
					fr, err := mf.Allocate(uint64(copyAR.Length()), pgalloc.AllocOpts{
						Kind:    usage.Anonymous,
						Mode:    pgalloc.AllocateAndWritePopulate,
						MemCgID: memCgID,
						Reader:  &safemem.BlockSeqReader{mm.internalMappingsLocked(pseg, copyAR)},
					})
					if _, ok := err.(safecopy.BusError); ok {
						// If we got SIGBUS during the copy, deliver SIGBUS to
						// userspace (instead of SIGSEGV) if we're breaking
						// copy-on-write due to application page fault.
						err = &memmap.BusError{err}
					}
					if fr.Length() == 0 {
						return pstart, pseg.PrevGap(), err
					}
					// Unmap all of maskAR, not just copyAR, to minimize host
					// syscalls. AddressSpace mappings must be removed before
					// mm.decPrivateRef().
					if !didUnmapAS {
						mm.unmapASLocked(maskAR)
						didUnmapAS = true
					}
					// Replace the pma with a copy in the part of the address
					// range where copying was successful. This doesn't change
					// RSS.
					copyAR.End = copyAR.Start + hostarch.Addr(fr.Length())
					if copyAR != pseg.Range() {
						pseg = mm.pmas.Isolate(pseg, copyAR)
						pstart = pmaIterator{} // iterators invalidated
					}
					oldpma = pseg.ValuePtr()
					if oldpma.private {
						mm.decPrivateRef(pseg.fileRange())
					}
					oldpma.file.DecRef(pseg.fileRange())
					mm.incPrivateRef(fr)
					mf.IncRef(fr, memCgID)
					oldpma.file = mf
					oldpma.off = fr.Start
					oldpma.translatePerms = hostarch.AnyAccess
					oldpma.effectivePerms = vma.effectivePerms
					oldpma.maxPerms = vma.maxPerms
					oldpma.needCOW = false
					oldpma.private = true
					oldpma.internalMappings = safemem.BlockSeq{}
					// Try to merge the pma with its neighbors.
					if prev := pseg.PrevSegment(); prev.Ok() {
						if merged := mm.pmas.Merge(prev, pseg); merged.Ok() {
							pseg = merged
							pstart = pmaIterator{} // iterators invalidated
						}
					}
					if next := pseg.NextSegment(); next.Ok() {
						if merged := mm.pmas.Merge(pseg, next); merged.Ok() {
							pseg = merged
							pstart = pmaIterator{} // iterators invalidated
						}
					}
					// The error returned by Allocate is only significant if
					// it occurred before ar.End.
					if err != nil && pseg.End() < ar.End {
						return pstart, pseg.NextGap(), err
					}
					// Ensure pseg and pgap are correct for the next iteration
					// of the loop.
					pseg, pgap = pseg.NextNonEmpty()
				} else if !oldpma.translatePerms.SupersetOf(at) {
					// Get new pmas (with sufficient permissions) by calling
					// memmap.Mappable.Translate again.
					if checkInvariants {
						if oldpma.private {
							panic(fmt.Sprintf("private pma %v has non-maximal pma.translatePerms: %v", pseg.Range(), oldpma))
						}
					}
					// Allow the entire pma to be replaced.
					optAR := pseg.Range()
					optMR := vseg.mappableRangeOf(optAR)
					reqAR := optAR.Intersect(ar)
					reqMR := vseg.mappableRangeOf(reqAR)
					perms := oldpma.translatePerms.Union(at)
					ts, err := vma.mappable.Translate(ctx, reqMR, optMR, perms)
					if checkInvariants {
						if err := memmap.CheckTranslateResult(reqMR, optMR, perms, ts, err); err != nil {
							panic(fmt.Sprintf("Mappable(%T).Translate(%v, %v, %v): %v", vma.mappable, reqMR, optMR, perms, err))
						}
					}
					// Remove the part of the existing pma covered by new
					// Translations, then insert new pmas. This doesn't change
					// RSS. Note that we don't need to call unmapASLocked: any
					// existing AddressSpace mappings are still valid (though
					// less permissive than the new pmas indicate) until
					// Invalidate is called, and will be replaced by future
					// calls to mapASLocked.
					if len(ts) == 0 {
						return pstart, pseg.PrevGap(), err
					}
					transMR := memmap.MappableRange{ts[0].Source.Start, ts[len(ts)-1].Source.End}
					transAR := vseg.addrRangeOf(transMR)
					pseg = mm.pmas.Isolate(pseg, transAR)
					pseg.ValuePtr().file.DecRef(pseg.fileRange())
					pgap = mm.pmas.Remove(pseg)
					pstart = pmaIterator{} // iterators invalidated
					for _, t := range ts {
						newpmaAR := vseg.addrRangeOf(t.Source)
						newpma := pma{
							file:           t.File,
							off:            t.Offset,
							translatePerms: t.Perms,
							effectivePerms: vma.effectivePerms.Intersect(t.Perms),
							maxPerms:       vma.maxPerms.Intersect(t.Perms),
						}
						if vma.private {
							newpma.effectivePerms.Write = false
							newpma.maxPerms.Write = false
							newpma.needCOW = true
						}
						t.File.IncRef(t.FileRange(), memCgID)
						pseg = mm.pmas.Insert(pgap, newpmaAR, newpma)
						pgap = pseg.NextGap()
					}
					// The error returned by Translate is only significant if
					// it occurred before ar.End.
					if err != nil && pseg.End() < ar.End {
						return pstart, pgap, err
					}
					// Ensure pseg and pgap are correct for the next iteration
					// of the loop.
					if pgap.Range().Length() == 0 {
						pseg, pgap = pgap.NextSegment(), pmaGapIterator{}
					} else {
						pseg = pmaIterator{}
					}
				} else {
					// We have a usable pma; continue.
					pseg, pgap = pseg.NextNonEmpty()
				}

			default:
				break pmaLoop
			}
		}
		// Go to the next vma.
		if ar.End <= vseg.End() {
			if pgap.Ok() {
				return pstart, pgap, nil
			}
			return pstart, pseg.PrevGap(), nil
		}
		vseg = vseg.NextSegment()
	}
}

const (
	// When memory is allocated for a private pma, align the allocated address
	// range to a privateAllocUnit boundary when possible. Larger values of
	// privateAllocUnit may reduce page faults by allowing fewer, larger pmas
	// to be mapped, but may result in larger amounts of wasted memory in the
	// presence of fragmentation. privateAllocUnit must be a power-of-2
	// multiple of hostarch.PageSize.
	privateAllocUnit = hostarch.HugePageSize

	privateAllocMask = privateAllocUnit - 1
)

func privateAligned(ar hostarch.AddrRange) hostarch.AddrRange {
	aligned := hostarch.AddrRange{ar.Start &^ privateAllocMask, ar.End}
	if end := (ar.End + privateAllocMask) &^ privateAllocMask; end >= ar.End {
		aligned.End = end
	}
	if checkInvariants {
		if !aligned.IsSupersetOf(ar) {
			panic(fmt.Sprintf("aligned AddrRange %#v is not a superset of ar %#v", aligned, ar))
		}
	}
	return aligned
}
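
// For illustration: assuming 4 KiB pages and a 2 MiB hostarch.HugePageSize
// (as on x86-64 and ARM64 with 4 KiB base pages), privateAligned expands
// [0x2ff000, 0x301000) to [0x200000, 0x400000). getPMAsInternalLocked uses
// the result (maskAR) only as an upper bound, so a fault on a few pages may
// allocate a single pma covering up to the whole aligned region, clipped to
// the containing vma and to any existing neighboring pmas. The end is
// extended only when rounding up does not overflow, which is what the
// end >= ar.End check above guards against.
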
// isPMACopyOnWriteLocked returns true if the contents of the pma represented
// by pseg must be copied to a new private pma to be written to.
//
// If the pma is a copy-on-write private pma, and holds the only reference on
// the memory it maps, isPMACopyOnWriteLocked will take ownership of the memory
// and update the pma to indicate that it does not require copy-on-write.
//
// Preconditions:
//   - vseg.Range().IsSupersetOf(pseg.Range()).
//   - mm.mappingMu must be locked.
//   - mm.activeMu must be locked for writing.
func (mm *MemoryManager) isPMACopyOnWriteLocked(vseg vmaIterator, pseg pmaIterator) bool {
	pma := pseg.ValuePtr()
	if !pma.needCOW {
		return false
	}
	if !pma.private {
		return true
	}
	// If we have the only reference on private memory to be copied, just take
	// ownership of it instead of copying. If we do hold the only reference,
	// additional references can only be taken by mm.Fork(), which is excluded
	// by mm.activeMu, so this isn't racy.
	mm.privateRefs.mu.Lock()
	defer mm.privateRefs.mu.Unlock()
	fr := pseg.fileRange()
	// This check relies on mm.privateRefs.refs being kept fully merged.
	rseg := mm.privateRefs.refs.FindSegment(fr.Start)
	if rseg.Ok() && rseg.Value() == 1 && fr.End <= rseg.End() {
		pma.needCOW = false
		// pma.private => pma.translatePerms == hostarch.AnyAccess
		vma := vseg.ValuePtr()
		pma.effectivePerms = vma.effectivePerms
		pma.maxPerms = vma.maxPerms
		return false
	}
	return true
}
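
// For illustration (the fork path itself lives in mm.Fork(), outside this
// file): after a fork, parent and child pmas for the same private page both
// have needCOW set and the private reference count for the underlying
// FileRange is 2, so the first write fault in either MM copies the page.
// If one side instead unmaps the page first, dropping the count back to 1,
// the check above lets the surviving MM simply clear needCOW and restore
// the vma's permissions rather than copying.
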
// Invalidate implements memmap.MappingSpace.Invalidate.
func (mm *MemoryManager) Invalidate(ar hostarch.AddrRange, opts memmap.InvalidateOpts) {
	if checkInvariants {
		if !ar.WellFormed() || ar.Length() == 0 || !ar.IsPageAligned() {
			panic(fmt.Sprintf("invalid ar: %v", ar))
		}
	}

	mm.activeMu.Lock()
	defer mm.activeMu.Unlock()
	if mm.captureInvalidations {
		mm.capturedInvalidations = append(mm.capturedInvalidations, invalidateArgs{ar, opts})
		return
	}
	mm.invalidateLocked(ar, opts.InvalidatePrivate, true)
}

// invalidateLocked removes pmas and AddressSpace mappings of those pmas for
// addresses in ar.
//
// Preconditions:
//   - mm.activeMu must be locked for writing.
//   - ar.Length() != 0.
//   - ar must be page-aligned.
func (mm *MemoryManager) invalidateLocked(ar hostarch.AddrRange, invalidatePrivate, invalidateShared bool) {
	if checkInvariants {
		if !ar.WellFormed() || ar.Length() == 0 || !ar.IsPageAligned() {
			panic(fmt.Sprintf("invalid ar: %v", ar))
		}
	}

	var didUnmapAS bool
	pseg := mm.pmas.LowerBoundSegment(ar.Start)
	for pseg.Ok() && pseg.Start() < ar.End {
		pma := pseg.ValuePtr()
		if (invalidatePrivate && pma.private) || (invalidateShared && !pma.private) {
			pseg = mm.pmas.Isolate(pseg, ar)
			pma = pseg.ValuePtr()
			if !didUnmapAS {
				// Unmap all of ar, not just pseg.Range(), to minimize host
				// syscalls. AddressSpace mappings must be removed before
				// mm.decPrivateRef().
				//
				// Note that we do more than just ar here, and extrapolate
				// to the end of any previous region that we may have mapped.
				// This is done to ensure that lower layers can fully invalidate
				// intermediate pagetable pages during the unmap.
				var unmapAR hostarch.AddrRange
				if prev := pseg.PrevSegment(); prev.Ok() {
					unmapAR.Start = prev.End()
				} else {
					unmapAR.Start = mm.layout.MinAddr
				}
				if last := mm.pmas.LowerBoundSegment(ar.End); last.Ok() {
					if last.Start() < ar.End {
						unmapAR.End = ar.End
					} else {
						unmapAR.End = last.Start()
					}
				} else {
					unmapAR.End = mm.layout.MaxAddr
				}
				mm.unmapASLocked(unmapAR)
				didUnmapAS = true
			}
			if pma.private {
				mm.decPrivateRef(pseg.fileRange())
			}
			mm.removeRSSLocked(pseg.Range())
			pma.file.DecRef(pseg.fileRange())
			pseg = mm.pmas.Remove(pseg).NextSegment()
		} else {
			pseg = pseg.NextSegment()
		}
	}
}

// Pin returns the memmap.File ranges currently mapped by addresses in ar in
// mm, acquiring a reference on the returned ranges which the caller must
// release by calling Unpin. If not all addresses are mapped, Pin returns a
// non-nil error. Note that Pin may return both a non-empty slice of
// PinnedRanges and a non-nil error.
//
// Pin does not prevent mapped ranges from changing, making it unsuitable for
// most I/O. It should only be used in contexts that would use get_user_pages()
// in the Linux kernel.
//
// Preconditions:
//   - ar.Length() != 0.
//   - ar must be page-aligned.
func (mm *MemoryManager) Pin(ctx context.Context, ar hostarch.AddrRange, at hostarch.AccessType, ignorePermissions bool) ([]PinnedRange, error) {
	if checkInvariants {
		if !ar.WellFormed() || ar.Length() == 0 || !ar.IsPageAligned() {
			panic(fmt.Sprintf("invalid ar: %v", ar))
		}
	}

	// Ensure that we have usable vmas.
	mm.mappingMu.RLock()
	vseg, vend, verr := mm.getVMAsLocked(ctx, ar, at, ignorePermissions)
	if vendaddr := vend.Start(); vendaddr < ar.End {
		if vendaddr <= ar.Start {
			mm.mappingMu.RUnlock()
			return nil, verr
		}
		ar.End = vendaddr
	}

	// Ensure that we have usable pmas.
	mm.activeMu.Lock()
	pseg, pend, perr := mm.getPMAsLocked(ctx, vseg, ar, at)
	mm.mappingMu.RUnlock()
	if pendaddr := pend.Start(); pendaddr < ar.End {
		if pendaddr <= ar.Start {
			mm.activeMu.Unlock()
			return nil, perr
		}
		ar.End = pendaddr
	}

	memCgID := pgalloc.MemoryCgroupIDFromContext(ctx)
	// Gather pmas.
	var prs []PinnedRange
	for pseg.Ok() && pseg.Start() < ar.End {
		psar := pseg.Range().Intersect(ar)
		f := pseg.ValuePtr().file
		fr := pseg.fileRangeOf(psar)
		f.IncRef(fr, memCgID)
		prs = append(prs, PinnedRange{
			Source: psar,
			File:   f,
			Offset: fr.Start,
		})
		pseg = pseg.NextSegment()
	}
	mm.activeMu.Unlock()

	// Return the first error in order of progress through ar.
	if perr != nil {
		return prs, perr
	}
	return prs, verr
}

// PinnedRanges are returned by MemoryManager.Pin.
type PinnedRange struct {
	// Source is the corresponding range of addresses.
	Source hostarch.AddrRange

	// File is the mapped file.
	File memmap.File

	// Offset is the offset into File at which this PinnedRange begins.
	Offset uint64
}

// FileRange returns the memmap.File offsets mapped by pr.
func (pr PinnedRange) FileRange() memmap.FileRange {
	return memmap.FileRange{pr.Offset, pr.Offset + uint64(pr.Source.Length())}
}
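
// For illustration, a typical Pin/Unpin sequence (the caller and arguments
// are hypothetical; Pin requires a page-aligned ar):
//
//	prs, err := mm.Pin(ctx, ar, hostarch.Read, false /* ignorePermissions */)
//	defer Unpin(prs) // release the references acquired by Pin
//	for _, pr := range prs {
//		fr := pr.FileRange() // offsets into pr.File backing pr.Source
//		_ = fr
//	}
//	if err != nil {
//		// Pin may return pinned ranges and an error together: prs covers
//		// the prefix of ar that was successfully pinned.
//	}
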
// Unpin releases the reference held by prs.
func Unpin(prs []PinnedRange) {
	for i := range prs {
		prs[i].File.DecRef(prs[i].FileRange())
	}
}

// movePMAsLocked moves all pmas in oldAR to newAR.
//
// Preconditions:
//   - mm.activeMu must be locked for writing.
//   - oldAR.Length() != 0.
//   - oldAR.Length() <= newAR.Length().
//   - !oldAR.Overlaps(newAR).
//   - mm.pmas.IsEmptyRange(newAR).
//   - oldAR and newAR must be page-aligned.
func (mm *MemoryManager) movePMAsLocked(oldAR, newAR hostarch.AddrRange) {
	if checkInvariants {
		if !oldAR.WellFormed() || oldAR.Length() == 0 || !oldAR.IsPageAligned() {
			panic(fmt.Sprintf("invalid oldAR: %v", oldAR))
		}
		if !newAR.WellFormed() || newAR.Length() == 0 || !newAR.IsPageAligned() {
			panic(fmt.Sprintf("invalid newAR: %v", newAR))
		}
		if oldAR.Length() > newAR.Length() {
			panic(fmt.Sprintf("old address range %v may contain pmas that will not fit in new address range %v", oldAR, newAR))
		}
		if oldAR.Overlaps(newAR) {
			panic(fmt.Sprintf("old and new address ranges overlap: %v, %v", oldAR, newAR))
		}
		// mm.pmas.IsEmptyRange is checked by mm.pmas.Insert.
	}

	type movedPMA struct {
		oldAR hostarch.AddrRange
		pma   pma
	}
	var movedPMAs []movedPMA
	pseg := mm.pmas.LowerBoundSegment(oldAR.Start)
	for pseg.Ok() && pseg.Start() < oldAR.End {
		pseg = mm.pmas.Isolate(pseg, oldAR)
		movedPMAs = append(movedPMAs, movedPMA{
			oldAR: pseg.Range(),
			pma:   pseg.Value(),
		})
		pseg = mm.pmas.Remove(pseg).NextSegment()
		// No RSS change is needed since we're re-inserting the same pmas
		// below.
	}

	off := newAR.Start - oldAR.Start
	pgap := mm.pmas.FindGap(newAR.Start)
	for i := range movedPMAs {
		mpma := &movedPMAs[i]
		pmaNewAR := hostarch.AddrRange{mpma.oldAR.Start + off, mpma.oldAR.End + off}
		pgap = mm.pmas.Insert(pgap, pmaNewAR, mpma.pma).NextGap()
	}

	mm.unmapASLocked(oldAR)
}

// getPMAInternalMappingsLocked ensures that pmas for all addresses in ar have
// cached internal mappings. It returns:
//
//   - An iterator to the gap after the last pma with internal mappings
//     containing an address in ar. If internal mappings exist for no addresses in
//     ar, the iterator is to a gap that begins before ar.Start.
//
//   - An error that is non-nil if internal mappings exist for only a subset of
//     ar.
//
// Preconditions:
//   - mm.activeMu must be locked for writing.
//   - pseg.Range().Contains(ar.Start).
//   - pmas must exist for all addresses in ar.
//   - ar.Length() != 0.
//
// Postconditions: getPMAInternalMappingsLocked does not invalidate iterators
// into mm.pmas.
func (mm *MemoryManager) getPMAInternalMappingsLocked(pseg pmaIterator, ar hostarch.AddrRange) (pmaGapIterator, error) {
	if checkInvariants {
		if !ar.WellFormed() || ar.Length() == 0 {
			panic(fmt.Sprintf("invalid ar: %v", ar))
		}
		if !pseg.Range().Contains(ar.Start) {
			panic(fmt.Sprintf("initial pma %v does not cover start of ar %v", pseg.Range(), ar))
		}
	}

	for {
		if err := pseg.getInternalMappingsLocked(); err != nil {
			return pseg.PrevGap(), err
		}
		if ar.End <= pseg.End() {
			return pseg.NextGap(), nil
		}
		pseg, _ = pseg.NextNonEmpty()
	}
}

// getVecPMAInternalMappingsLocked ensures that pmas for all addresses in ars
// have cached internal mappings. It returns the subset of ars for which
// internal mappings exist. If this is not equal to ars, it returns a non-nil
// error explaining why.
//
// Preconditions:
//   - mm.activeMu must be locked for writing.
//   - pmas must exist for all addresses in ars.
//
// Postconditions: getVecPMAInternalMappingsLocked does not invalidate iterators
// into mm.pmas.
func (mm *MemoryManager) getVecPMAInternalMappingsLocked(ars hostarch.AddrRangeSeq) (hostarch.AddrRangeSeq, error) {
	for arsit := ars; !arsit.IsEmpty(); arsit = arsit.Tail() {
		ar := arsit.Head()
		if ar.Length() == 0 {
			continue
		}
		if pend, err := mm.getPMAInternalMappingsLocked(mm.pmas.FindSegment(ar.Start), ar); err != nil {
			return truncatedAddrRangeSeq(ars, arsit, pend.Start()), err
		}
	}
	return ars, nil
}

// internalMappingsLocked returns internal mappings for addresses in ar.
//
// Preconditions:
//   - mm.activeMu must be locked.
//   - Internal mappings must have been previously established for all addresses
//     in ar.
//   - ar.Length() != 0.
//   - pseg.Range().Contains(ar.Start).
func (mm *MemoryManager) internalMappingsLocked(pseg pmaIterator, ar hostarch.AddrRange) safemem.BlockSeq {
	if checkInvariants {
		if !ar.WellFormed() || ar.Length() == 0 {
			panic(fmt.Sprintf("invalid ar: %v", ar))
		}
		if !pseg.Range().Contains(ar.Start) {
			panic(fmt.Sprintf("initial pma %v does not cover start of ar %v", pseg.Range(), ar))
		}
	}

	if ar.End <= pseg.End() {
		// Since only one pma is involved, we can use pma.internalMappings
		// directly, avoiding a slice allocation.
		offset := uint64(ar.Start - pseg.Start())
		return pseg.ValuePtr().internalMappings.DropFirst64(offset).TakeFirst64(uint64(ar.Length()))
	}

	var ims []safemem.Block
	for {
		pr := pseg.Range().Intersect(ar)
		for pims := pseg.ValuePtr().internalMappings.DropFirst64(uint64(pr.Start - pseg.Start())).TakeFirst64(uint64(pr.Length())); !pims.IsEmpty(); pims = pims.Tail() {
			ims = append(ims, pims.Head())
		}
		if ar.End <= pseg.End() {
			break
		}
		pseg = pseg.NextSegment()
	}
	return safemem.BlockSeqFromSlice(ims)
}
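
// For illustration: for a single pma spanning [0x400000, 0x408000) with
// cached internal mappings, internalMappingsLocked(pseg, [0x402000, 0x404000))
// takes the fast path above, dropping the first 0x2000 bytes of the cached
// BlockSeq and keeping the next 0x2000; only requests that cross pma
// boundaries pay for the []safemem.Block allocation.
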
// vecInternalMappingsLocked returns internal mappings for addresses in ars.
//
// Preconditions:
//   - mm.activeMu must be locked.
//   - Internal mappings must have been previously established for all addresses
//     in ars.
func (mm *MemoryManager) vecInternalMappingsLocked(ars hostarch.AddrRangeSeq) safemem.BlockSeq {
	var ims []safemem.Block
	for ; !ars.IsEmpty(); ars = ars.Tail() {
		ar := ars.Head()
		if ar.Length() == 0 {
			continue
		}
		for pims := mm.internalMappingsLocked(mm.pmas.FindSegment(ar.Start), ar); !pims.IsEmpty(); pims = pims.Tail() {
			ims = append(ims, pims.Head())
		}
	}
	return safemem.BlockSeqFromSlice(ims)
}

// incPrivateRef acquires a reference on private pages in fr.
func (mm *MemoryManager) incPrivateRef(fr memmap.FileRange) {
	mm.privateRefs.mu.Lock()
	defer mm.privateRefs.mu.Unlock()
	refSet := &mm.privateRefs.refs
	seg, gap := refSet.Find(fr.Start)
	for {
		switch {
		case seg.Ok() && seg.Start() < fr.End:
			seg = refSet.Isolate(seg, fr)
			seg.SetValue(seg.Value() + 1)
			seg, gap = seg.NextNonEmpty()
		case gap.Ok() && gap.Start() < fr.End:
			seg, gap = refSet.InsertWithoutMerging(gap, gap.Range().Intersect(fr), 1).NextNonEmpty()
		default:
			refSet.MergeAdjacent(fr)
			return
		}
	}
}

// decPrivateRef releases a reference on private pages in fr.
func (mm *MemoryManager) decPrivateRef(fr memmap.FileRange) {
	var freed []memmap.FileRange

	mm.privateRefs.mu.Lock()
	refSet := &mm.privateRefs.refs
	seg := refSet.LowerBoundSegment(fr.Start)
	for seg.Ok() && seg.Start() < fr.End {
		seg = refSet.Isolate(seg, fr)
		if old := seg.Value(); old == 1 {
			freed = append(freed, seg.Range())
			seg = refSet.Remove(seg).NextSegment()
		} else {
			seg.SetValue(old - 1)
			seg = seg.NextSegment()
		}
	}
	refSet.MergeAdjacent(fr)
	mm.privateRefs.mu.Unlock()

	mf := mm.mfp.MemoryFile()
	for _, fr := range freed {
		mf.DecRef(fr)
	}
}

// addRSSLocked updates the current and maximum resident set size of a
// MemoryManager to reflect the insertion of a pma at ar.
//
// Preconditions: mm.activeMu must be locked for writing.
func (mm *MemoryManager) addRSSLocked(ar hostarch.AddrRange) {
	mm.curRSS += uint64(ar.Length())
	if mm.curRSS > mm.maxRSS {
		mm.maxRSS = mm.curRSS
	}
}

// removeRSSLocked updates the current resident set size of a MemoryManager to
// reflect the removal of a pma at ar.
//
// Preconditions: mm.activeMu must be locked for writing.
func (mm *MemoryManager) removeRSSLocked(ar hostarch.AddrRange) {
	mm.curRSS -= uint64(ar.Length())
}

// pmaSetFunctions implements segment.Functions for pmaSet.
type pmaSetFunctions struct{}

func (pmaSetFunctions) MinKey() hostarch.Addr {
	return 0
}

func (pmaSetFunctions) MaxKey() hostarch.Addr {
	return ^hostarch.Addr(0)
}

func (pmaSetFunctions) ClearValue(pma *pma) {
	pma.file = nil
	pma.internalMappings = safemem.BlockSeq{}
}

func (pmaSetFunctions) Merge(ar1 hostarch.AddrRange, pma1 pma, ar2 hostarch.AddrRange, pma2 pma) (pma, bool) {
	if pma1.file != pma2.file ||
		pma1.off+uint64(ar1.Length()) != pma2.off ||
		pma1.translatePerms != pma2.translatePerms ||
		pma1.effectivePerms != pma2.effectivePerms ||
		pma1.maxPerms != pma2.maxPerms ||
		pma1.needCOW != pma2.needCOW ||
		pma1.private != pma2.private {
		return pma{}, false
	}

	// Discard internal mappings instead of trying to merge them, since merging
	// them requires an allocation and getting them again from the
	// memmap.File might not.
	pma1.internalMappings = safemem.BlockSeq{}
	return pma1, true
}

func (pmaSetFunctions) Split(ar hostarch.AddrRange, p pma, split hostarch.Addr) (pma, pma) {
	newlen1 := uint64(split - ar.Start)
	p2 := p
	p2.off += newlen1
	if !p.internalMappings.IsEmpty() {
		p.internalMappings = p.internalMappings.TakeFirst64(newlen1)
		p2.internalMappings = p2.internalMappings.DropFirst64(newlen1)
	}
	return p, p2
}

// findOrSeekPrevUpperBoundPMA returns mm.pmas.UpperBoundSegment(addr), but may do
// so by scanning linearly backward from pgap.
//
// Preconditions:
//   - mm.activeMu must be locked.
//   - addr <= pgap.Start().
func (mm *MemoryManager) findOrSeekPrevUpperBoundPMA(addr hostarch.Addr, pgap pmaGapIterator) pmaIterator {
	if checkInvariants {
		if !pgap.Ok() {
			panic("terminal pma iterator")
		}
		if addr > pgap.Start() {
			panic(fmt.Sprintf("can't seek backward to %#x from %#x", addr, pgap.Start()))
		}
	}
	// Optimistically check if pgap.PrevSegment() is the PMA we're looking for,
	// which is the case if findOrSeekPrevUpperBoundPMA is called to find the
	// start of a range containing only a single PMA.
	if pseg := pgap.PrevSegment(); pseg.Start() <= addr {
		return pseg
	}
	return mm.pmas.UpperBoundSegment(addr)
}

// getInternalMappingsLocked ensures that pseg.ValuePtr().internalMappings is
// non-empty.
//
// Preconditions: mm.activeMu must be locked for writing.
func (pseg pmaIterator) getInternalMappingsLocked() error {
	pma := pseg.ValuePtr()
	if pma.internalMappings.IsEmpty() {
		// This must use maxPerms (instead of perms) because some permission
		// constraints are only visible to vmas; for example, mappings of
		// read-only files have vma.maxPerms.Write unset, but this may not be
		// visible to the memmap.Mappable.
		perms := pma.maxPerms
		// We will never execute application code through an internal mapping.
		perms.Execute = false
		ims, err := pma.file.MapInternal(pseg.fileRange(), perms)
		if err != nil {
			return err
		}
		pma.internalMappings = ims
	}
	return nil
}

func (pseg pmaIterator) fileRange() memmap.FileRange {
	return pseg.fileRangeOf(pseg.Range())
}

// Preconditions:
//   - pseg.Range().IsSupersetOf(ar).
//   - ar.Length() != 0.
func (pseg pmaIterator) fileRangeOf(ar hostarch.AddrRange) memmap.FileRange {
	if checkInvariants {
		if !pseg.Ok() {
			panic("terminal pma iterator")
		}
		if !ar.WellFormed() || ar.Length() == 0 {
			panic(fmt.Sprintf("invalid ar: %v", ar))
		}
		if !pseg.Range().IsSupersetOf(ar) {
			panic(fmt.Sprintf("ar %v out of bounds %v", ar, pseg.Range()))
		}
	}

	pma := pseg.ValuePtr()
	pstart := pseg.Start()
	return memmap.FileRange{pma.off + uint64(ar.Start-pstart), pma.off + uint64(ar.End-pstart)}
}
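
// For illustration: for a pma covering [0x400000, 0x500000) with off 0x10000,
// fileRangeOf([0x401000, 0x403000)) yields the memmap.FileRange
// [0x11000, 0x13000); fileRange() is the same computation applied to the
// pma's entire address range.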