github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/mm/vma.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package mm

import (
	"fmt"

	"github.com/SagerNet/gvisor/pkg/abi/linux"
	"github.com/SagerNet/gvisor/pkg/context"
	"github.com/SagerNet/gvisor/pkg/errors/linuxerr"
	"github.com/SagerNet/gvisor/pkg/hostarch"
	"github.com/SagerNet/gvisor/pkg/sentry/arch"
	"github.com/SagerNet/gvisor/pkg/sentry/kernel/auth"
	"github.com/SagerNet/gvisor/pkg/sentry/limits"
	"github.com/SagerNet/gvisor/pkg/sentry/memmap"
	"github.com/SagerNet/gvisor/pkg/syserror"
)

// Preconditions:
// * mm.mappingMu must be locked for writing.
// * opts must be valid as defined by the checks in MMap.
func (mm *MemoryManager) createVMALocked(ctx context.Context, opts memmap.MMapOpts) (vmaIterator, hostarch.AddrRange, error) {
	if opts.MaxPerms != opts.MaxPerms.Effective() {
		panic(fmt.Sprintf("Non-effective MaxPerms %s cannot be enforced", opts.MaxPerms))
	}

	// Find a usable range.
	addr, err := mm.findAvailableLocked(opts.Length, findAvailableOpts{
		Addr:     opts.Addr,
		Fixed:    opts.Fixed,
		Unmap:    opts.Unmap,
		Map32Bit: opts.Map32Bit,
	})
	if err != nil {
		// Can't force without opts.Unmap and opts.Fixed.
		if opts.Force && opts.Unmap && opts.Fixed {
			addr = opts.Addr
		} else {
			return vmaIterator{}, hostarch.AddrRange{}, err
		}
	}
	ar, _ := addr.ToRange(opts.Length)

	// Check against RLIMIT_AS.
	newUsageAS := mm.usageAS + opts.Length
	if opts.Unmap {
		newUsageAS -= uint64(mm.vmas.SpanRange(ar))
	}
	if limitAS := limits.FromContext(ctx).Get(limits.AS).Cur; newUsageAS > limitAS {
		return vmaIterator{}, hostarch.AddrRange{}, syserror.ENOMEM
	}

	if opts.MLockMode != memmap.MLockNone {
		// Check against RLIMIT_MEMLOCK.
		if creds := auth.CredentialsFromContext(ctx); !creds.HasCapabilityIn(linux.CAP_IPC_LOCK, creds.UserNamespace.Root()) {
			mlockLimit := limits.FromContext(ctx).Get(limits.MemoryLocked).Cur
			if mlockLimit == 0 {
				return vmaIterator{}, hostarch.AddrRange{}, linuxerr.EPERM
			}
			newLockedAS := mm.lockedAS + opts.Length
			if opts.Unmap {
				newLockedAS -= mm.mlockedBytesRangeLocked(ar)
			}
			if newLockedAS > mlockLimit {
				return vmaIterator{}, hostarch.AddrRange{}, linuxerr.EAGAIN
			}
		}
	}

	// Remove overwritten mappings. This ordering is consistent with Linux:
	// compare Linux's mm/mmap.c:mmap_region() => do_munmap(),
	// file->f_op->mmap().
	var vgap vmaGapIterator
	if opts.Unmap {
		vgap = mm.unmapLocked(ctx, ar)
	} else {
		vgap = mm.vmas.FindGap(ar.Start)
	}

	// Inform the Mappable, if any, of the new mapping.
	if opts.Mappable != nil {
		// The expression for writable is vma.canWriteMappableLocked(), but we
		// don't yet have a vma.
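		//
		// A private (MAP_PRIVATE) mapping never writes back to the mappable,
		// while a shared mapping may later gain write permission via mprotect
		// (bounded by MaxPerms), so writability is derived from MaxPerms
		// rather than the current Perms.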
		if err := opts.Mappable.AddMapping(ctx, mm, ar, opts.Offset, !opts.Private && opts.MaxPerms.Write); err != nil {
			return vmaIterator{}, hostarch.AddrRange{}, err
		}
	}

	// Take a reference on opts.MappingIdentity before inserting the vma since
	// vma merging can drop the reference.
	if opts.MappingIdentity != nil {
		opts.MappingIdentity.IncRef()
	}

	// Finally insert the vma.
	v := vma{
		mappable:       opts.Mappable,
		off:            opts.Offset,
		realPerms:      opts.Perms,
		effectivePerms: opts.Perms.Effective(),
		maxPerms:       opts.MaxPerms,
		private:        opts.Private,
		growsDown:      opts.GrowsDown,
		mlockMode:      opts.MLockMode,
		numaPolicy:     linux.MPOL_DEFAULT,
		id:             opts.MappingIdentity,
		hint:           opts.Hint,
	}

	vseg := mm.vmas.Insert(vgap, ar, v)
	mm.usageAS += opts.Length
	if v.isPrivateDataLocked() {
		mm.dataAS += opts.Length
	}
	if opts.MLockMode != memmap.MLockNone {
		mm.lockedAS += opts.Length
	}

	return vseg, ar, nil
}

type findAvailableOpts struct {
	// These fields are equivalent to those in memmap.MMapOpts, except that:
	//
	// - Addr must be page-aligned.
	//
	// - Unmap allows existing guard pages in the returned range.

	Addr     hostarch.Addr
	Fixed    bool
	Unmap    bool
	Map32Bit bool
}

// map32Start/End are the bounds to which MAP_32BIT mappings are constrained,
// and are equivalent to Linux's MAP32_BASE and MAP32_MAX respectively.
const (
	map32Start = 0x40000000
	map32End   = 0x80000000
)

// findAvailableLocked finds an allocatable range.
//
// Preconditions: mm.mappingMu must be locked.
func (mm *MemoryManager) findAvailableLocked(length uint64, opts findAvailableOpts) (hostarch.Addr, error) {
	if opts.Fixed {
		opts.Map32Bit = false
	}
	allowedAR := mm.applicationAddrRange()
	if opts.Map32Bit {
		allowedAR = allowedAR.Intersect(hostarch.AddrRange{map32Start, map32End})
	}

	// Does the provided suggestion work?
	if ar, ok := opts.Addr.ToRange(length); ok {
		if allowedAR.IsSupersetOf(ar) {
			if opts.Unmap {
				return ar.Start, nil
			}
			// Check for the presence of an existing vma or guard page.
			if vgap := mm.vmas.FindGap(ar.Start); vgap.Ok() && vgap.availableRange().IsSupersetOf(ar) {
				return ar.Start, nil
			}
		}
	}

	// Fixed mappings accept only the requested address.
	if opts.Fixed {
		return 0, syserror.ENOMEM
	}

	// Prefer hugepage alignment if a hugepage or more is requested.
	alignment := uint64(hostarch.PageSize)
	if length >= hostarch.HugePageSize {
		alignment = hostarch.HugePageSize
	}

	if opts.Map32Bit {
		return mm.findLowestAvailableLocked(length, alignment, allowedAR)
	}
	if mm.layout.DefaultDirection == arch.MmapBottomUp {
		return mm.findLowestAvailableLocked(length, alignment, hostarch.AddrRange{mm.layout.BottomUpBase, mm.layout.MaxAddr})
	}
	return mm.findHighestAvailableLocked(length, alignment, hostarch.AddrRange{mm.layout.MinAddr, mm.layout.TopDownBase})
}

func (mm *MemoryManager) applicationAddrRange() hostarch.AddrRange {
	return hostarch.AddrRange{mm.layout.MinAddr, mm.layout.MaxAddr}
}

// Preconditions: mm.mappingMu must be locked.
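//
// findLowestAvailableLocked returns the start of the lowest-addressed gap in
// bounds that can hold length bytes, rounding the returned address up to
// alignment when the gap is large enough to allow it.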
func (mm *MemoryManager) findLowestAvailableLocked(length, alignment uint64, bounds hostarch.AddrRange) (hostarch.Addr, error) {
	for gap := mm.vmas.LowerBoundGap(bounds.Start); gap.Ok() && gap.Start() < bounds.End; gap = gap.NextLargeEnoughGap(hostarch.Addr(length)) {
		if gr := gap.availableRange().Intersect(bounds); uint64(gr.Length()) >= length {
			// Can we shift up to match the alignment?
			if offset := uint64(gr.Start) % alignment; offset != 0 {
				if uint64(gr.Length()) >= length+alignment-offset {
					// Yes; shift up to the aligned address.
					return gr.Start + hostarch.Addr(alignment-offset), nil
				}
			}

			// Either aligned perfectly, or can't align it.
			return gr.Start, nil
		}
	}
	return 0, syserror.ENOMEM
}

// Preconditions: mm.mappingMu must be locked.
func (mm *MemoryManager) findHighestAvailableLocked(length, alignment uint64, bounds hostarch.AddrRange) (hostarch.Addr, error) {
	for gap := mm.vmas.UpperBoundGap(bounds.End); gap.Ok() && gap.End() > bounds.Start; gap = gap.PrevLargeEnoughGap(hostarch.Addr(length)) {
		if gr := gap.availableRange().Intersect(bounds); uint64(gr.Length()) >= length {
			// Can we shift down to match the alignment?
			start := gr.End - hostarch.Addr(length)
			if offset := uint64(start) % alignment; offset != 0 {
				if gr.Start <= start-hostarch.Addr(offset) {
					// Yes; shift down to the aligned address.
					return start - hostarch.Addr(offset), nil
				}
			}

			// Either aligned perfectly, or can't align it.
			return start, nil
		}
	}
	return 0, syserror.ENOMEM
}

// Preconditions: mm.mappingMu must be locked.
func (mm *MemoryManager) mlockedBytesRangeLocked(ar hostarch.AddrRange) uint64 {
	var total uint64
	for vseg := mm.vmas.LowerBoundSegment(ar.Start); vseg.Ok() && vseg.Start() < ar.End; vseg = vseg.NextSegment() {
		if vseg.ValuePtr().mlockMode != memmap.MLockNone {
			total += uint64(vseg.Range().Intersect(ar).Length())
		}
	}
	return total
}

// getVMAsLocked ensures that vmas exist for all addresses in ar, and support
// access of type (at, ignorePermissions). It returns:
//
// - An iterator to the vma containing ar.Start. If no vma contains ar.Start,
// the iterator is unspecified.
//
// - An iterator to the gap after the last vma containing an address in ar. If
// vmas exist for no addresses in ar, the iterator is to a gap that begins
// before ar.Start.
//
// - An error that is non-nil if vmas exist for only a subset of ar.
//
// Preconditions:
// * mm.mappingMu must be locked for reading; it may be temporarily unlocked.
// * ar.Length() != 0.
func (mm *MemoryManager) getVMAsLocked(ctx context.Context, ar hostarch.AddrRange, at hostarch.AccessType, ignorePermissions bool) (vmaIterator, vmaGapIterator, error) {
	if checkInvariants {
		if !ar.WellFormed() || ar.Length() == 0 {
			panic(fmt.Sprintf("invalid ar: %v", ar))
		}
	}

	// Inline mm.vmas.LowerBoundSegment so that we have the preceding gap if
	// !vbegin.Ok().
	vbegin, vgap := mm.vmas.Find(ar.Start)
	if !vbegin.Ok() {
		vbegin = vgap.NextSegment()
		// vseg.Ok() is checked before entering the following loop.
	} else {
		vgap = vbegin.PrevGap()
	}

	addr := ar.Start
	vseg := vbegin
	for vseg.Ok() {
		// Loop invariants: vgap = vseg.PrevGap(); addr < vseg.End().
		vma := vseg.ValuePtr()
		if addr < vseg.Start() {
			// TODO(jamieliu): Implement vma.growsDown here.
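			//
			// The next vma begins above addr, so [addr, vseg.Start()) is
			// unmapped; the requested range is not fully backed by vmas.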
			return vbegin, vgap, syserror.EFAULT
		}

		perms := vma.effectivePerms
		if ignorePermissions {
			perms = vma.maxPerms
		}
		if !perms.SupersetOf(at) {
			return vbegin, vgap, linuxerr.EPERM
		}

		addr = vseg.End()
		vgap = vseg.NextGap()
		if addr >= ar.End {
			return vbegin, vgap, nil
		}
		vseg = vgap.NextSegment()
	}

	// Ran out of vmas before ar.End.
	return vbegin, vgap, syserror.EFAULT
}

// getVecVMAsLocked ensures that vmas exist for all addresses in ars, and
// support access of type (at, ignorePermissions). It returns the subset of
// ars for which vmas exist. If this is not equal to ars, it returns a non-nil
// error explaining why.
//
// Preconditions: mm.mappingMu must be locked for reading; it may be
// temporarily unlocked.
//
// Postconditions: ars is not mutated.
func (mm *MemoryManager) getVecVMAsLocked(ctx context.Context, ars hostarch.AddrRangeSeq, at hostarch.AccessType, ignorePermissions bool) (hostarch.AddrRangeSeq, error) {
	for arsit := ars; !arsit.IsEmpty(); arsit = arsit.Tail() {
		ar := arsit.Head()
		if ar.Length() == 0 {
			continue
		}
		if _, vend, err := mm.getVMAsLocked(ctx, ar, at, ignorePermissions); err != nil {
			return truncatedAddrRangeSeq(ars, arsit, vend.Start()), err
		}
	}
	return ars, nil
}

// vma extension will not shrink the number of unmapped bytes between the start
// of a growsDown vma and the end of its predecessor non-growsDown vma below
// guardBytes.
//
// guardBytes is equivalent to Linux's stack_guard_gap after upstream
// 1be7107fbe18 "mm: larger stack guard gap, between vmas".
const guardBytes = 256 * hostarch.PageSize

// unmapLocked unmaps all addresses in ar and returns the resulting gap in
// mm.vmas.
//
// Preconditions:
// * mm.mappingMu must be locked for writing.
// * ar.Length() != 0.
// * ar must be page-aligned.
func (mm *MemoryManager) unmapLocked(ctx context.Context, ar hostarch.AddrRange) vmaGapIterator {
	if checkInvariants {
		if !ar.WellFormed() || ar.Length() == 0 || !ar.IsPageAligned() {
			panic(fmt.Sprintf("invalid ar: %v", ar))
		}
	}

	// AddressSpace mappings and pmas must be invalidated before
	// mm.removeVMAsLocked() => memmap.Mappable.RemoveMapping().
	mm.Invalidate(ar, memmap.InvalidateOpts{InvalidatePrivate: true})
	return mm.removeVMAsLocked(ctx, ar)
}

// removeVMAsLocked removes vmas for addresses in ar and returns the resulting
// gap in mm.vmas. It does not remove pmas or AddressSpace mappings; clients
// must do so before calling removeVMAsLocked.
//
// Preconditions:
// * mm.mappingMu must be locked for writing.
// * ar.Length() != 0.
// * ar must be page-aligned.
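//
// Vmas that straddle ar.Start or ar.End are split (via Isolate) so that only
// the portions lying inside ar are removed.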
func (mm *MemoryManager) removeVMAsLocked(ctx context.Context, ar hostarch.AddrRange) vmaGapIterator {
	if checkInvariants {
		if !ar.WellFormed() || ar.Length() == 0 || !ar.IsPageAligned() {
			panic(fmt.Sprintf("invalid ar: %v", ar))
		}
	}

	vseg, vgap := mm.vmas.Find(ar.Start)
	if vgap.Ok() {
		vseg = vgap.NextSegment()
	}
	for vseg.Ok() && vseg.Start() < ar.End {
		vseg = mm.vmas.Isolate(vseg, ar)
		vmaAR := vseg.Range()
		vma := vseg.ValuePtr()
		if vma.mappable != nil {
			vma.mappable.RemoveMapping(ctx, mm, vmaAR, vma.off, vma.canWriteMappableLocked())
		}
		if vma.id != nil {
			vma.id.DecRef(ctx)
		}
		mm.usageAS -= uint64(vmaAR.Length())
		if vma.isPrivateDataLocked() {
			mm.dataAS -= uint64(vmaAR.Length())
		}
		if vma.mlockMode != memmap.MLockNone {
			mm.lockedAS -= uint64(vmaAR.Length())
		}
		vgap = mm.vmas.Remove(vseg)
		vseg = vgap.NextSegment()
	}
	return vgap
}

// canWriteMappableLocked returns true if it is possible for vma.mappable to be
// written to via this vma, i.e. if it is possible that
// vma.mappable.Translate(at.Write=true) may be called as a result of this vma.
// This includes via I/O with usermem.IOOpts.IgnorePermissions = true, such as
// PTRACE_POKEDATA.
//
// canWriteMappableLocked is equivalent to Linux's VM_SHARED.
//
// Preconditions: mm.mappingMu must be locked.
func (vma *vma) canWriteMappableLocked() bool {
	return !vma.private && vma.maxPerms.Write
}

// isPrivateDataLocked identifies the data segments: private, writable, and not
// stack.
//
// Preconditions: mm.mappingMu must be locked.
func (vma *vma) isPrivateDataLocked() bool {
	return vma.realPerms.Write && vma.private && !vma.growsDown
}

// vmaSetFunctions implements segment.Functions for vmaSet.
type vmaSetFunctions struct{}

func (vmaSetFunctions) MinKey() hostarch.Addr {
	return 0
}

func (vmaSetFunctions) MaxKey() hostarch.Addr {
	return ^hostarch.Addr(0)
}

func (vmaSetFunctions) ClearValue(vma *vma) {
	vma.mappable = nil
	vma.id = nil
	vma.hint = ""
}

func (vmaSetFunctions) Merge(ar1 hostarch.AddrRange, vma1 vma, ar2 hostarch.AddrRange, vma2 vma) (vma, bool) {
	if vma1.mappable != vma2.mappable ||
		(vma1.mappable != nil && vma1.off+uint64(ar1.Length()) != vma2.off) ||
		vma1.realPerms != vma2.realPerms ||
		vma1.maxPerms != vma2.maxPerms ||
		vma1.private != vma2.private ||
		vma1.growsDown != vma2.growsDown ||
		vma1.mlockMode != vma2.mlockMode ||
		vma1.numaPolicy != vma2.numaPolicy ||
		vma1.numaNodemask != vma2.numaNodemask ||
		vma1.dontfork != vma2.dontfork ||
		vma1.id != vma2.id ||
		vma1.hint != vma2.hint {
		return vma{}, false
	}

	if vma2.id != nil {
		vma2.id.DecRef(context.Background())
	}
	return vma1, true
}

func (vmaSetFunctions) Split(ar hostarch.AddrRange, v vma, split hostarch.Addr) (vma, vma) {
	v2 := v
	if v2.mappable != nil {
		v2.off += uint64(split - ar.Start)
	}
	if v2.id != nil {
		v2.id.IncRef()
	}
	return v, v2
}

// Preconditions:
// * vseg.ValuePtr().mappable != nil.
// * vseg.Range().Contains(addr).
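//
// mappableOffsetAt returns the offset into vseg's mappable corresponding to
// the address addr.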
func (vseg vmaIterator) mappableOffsetAt(addr hostarch.Addr) uint64 {
	if checkInvariants {
		if !vseg.Ok() {
			panic("terminal vma iterator")
		}
		if vseg.ValuePtr().mappable == nil {
			panic("Mappable offset is meaningless for anonymous vma")
		}
		if !vseg.Range().Contains(addr) {
			panic(fmt.Sprintf("addr %v out of bounds %v", addr, vseg.Range()))
		}
	}

	vma := vseg.ValuePtr()
	vstart := vseg.Start()
	return vma.off + uint64(addr-vstart)
}

// Preconditions: vseg.ValuePtr().mappable != nil.
func (vseg vmaIterator) mappableRange() memmap.MappableRange {
	return vseg.mappableRangeOf(vseg.Range())
}

// Preconditions:
// * vseg.ValuePtr().mappable != nil.
// * vseg.Range().IsSupersetOf(ar).
// * ar.Length() != 0.
func (vseg vmaIterator) mappableRangeOf(ar hostarch.AddrRange) memmap.MappableRange {
	if checkInvariants {
		if !vseg.Ok() {
			panic("terminal vma iterator")
		}
		if vseg.ValuePtr().mappable == nil {
			panic("MappableRange is meaningless for anonymous vma")
		}
		if !ar.WellFormed() || ar.Length() == 0 {
			panic(fmt.Sprintf("invalid ar: %v", ar))
		}
		if !vseg.Range().IsSupersetOf(ar) {
			panic(fmt.Sprintf("ar %v out of bounds %v", ar, vseg.Range()))
		}
	}

	vma := vseg.ValuePtr()
	vstart := vseg.Start()
	return memmap.MappableRange{vma.off + uint64(ar.Start-vstart), vma.off + uint64(ar.End-vstart)}
}

// Preconditions:
// * vseg.ValuePtr().mappable != nil.
// * vseg.mappableRange().IsSupersetOf(mr).
// * mr.Length() != 0.
func (vseg vmaIterator) addrRangeOf(mr memmap.MappableRange) hostarch.AddrRange {
	if checkInvariants {
		if !vseg.Ok() {
			panic("terminal vma iterator")
		}
		if vseg.ValuePtr().mappable == nil {
			panic("MappableRange is meaningless for anonymous vma")
		}
		if !mr.WellFormed() || mr.Length() == 0 {
			panic(fmt.Sprintf("invalid mr: %v", mr))
		}
		if !vseg.mappableRange().IsSupersetOf(mr) {
			panic(fmt.Sprintf("mr %v out of bounds %v", mr, vseg.mappableRange()))
		}
	}

	vma := vseg.ValuePtr()
	vstart := vseg.Start()
	return hostarch.AddrRange{vstart + hostarch.Addr(mr.Start-vma.off), vstart + hostarch.Addr(mr.End-vma.off)}
}

// seekNextLowerBound returns mm.vmas.LowerBoundSegment(addr), but does so by
// scanning linearly forward from vseg.
//
// Preconditions:
// * mm.mappingMu must be locked.
// * addr >= vseg.Start().
func (vseg vmaIterator) seekNextLowerBound(addr hostarch.Addr) vmaIterator {
	if checkInvariants {
		if !vseg.Ok() {
			panic("terminal vma iterator")
		}
		if addr < vseg.Start() {
			panic(fmt.Sprintf("can't seek forward to %#x from %#x", addr, vseg.Start()))
		}
	}
	for vseg.Ok() && addr >= vseg.End() {
		vseg = vseg.NextSegment()
	}
	return vseg
}

// availableRange returns the subset of vgap.Range() in which new vmas may be
// created without MMapOpts.Unmap == true.
func (vgap vmaGapIterator) availableRange() hostarch.AddrRange {
	ar := vgap.Range()
	next := vgap.NextSegment()
	if !next.Ok() || !next.ValuePtr().growsDown {
		return ar
	}
	// Exclude guard pages.
	if ar.Length() < guardBytes {
		return hostarch.AddrRange{ar.Start, ar.Start}
	}
	ar.End -= guardBytes
	return ar
}
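// Example (availableRange): with hostarch.PageSize == 4 KiB, guardBytes is
// 1 MiB, so a gap of 1 MiB + 4 KiB immediately below a growsDown vma exposes
// only its lowest 4 KiB to new non-Unmap mappings.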