github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/sentry/pgalloc/pgalloc.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package pgalloc contains the page allocator subsystem, which manages memory
// that may be mapped into application address spaces.
//
// Lock order:
//
//	pgalloc.MemoryFile.mu
//		pgalloc.MemoryFile.mappingsMu
package pgalloc

import (
	"fmt"
	"math"
	"os"
	"sync/atomic"
	"time"

	"golang.org/x/sys/unix"
	"github.com/metacubex/gvisor/pkg/abi/linux"
	"github.com/metacubex/gvisor/pkg/atomicbitops"
	"github.com/metacubex/gvisor/pkg/context"
	"github.com/metacubex/gvisor/pkg/errors/linuxerr"
	"github.com/metacubex/gvisor/pkg/hostarch"
	"github.com/metacubex/gvisor/pkg/log"
	"github.com/metacubex/gvisor/pkg/safemem"
	"github.com/metacubex/gvisor/pkg/sentry/hostmm"
	"github.com/metacubex/gvisor/pkg/sentry/memmap"
	"github.com/metacubex/gvisor/pkg/sentry/usage"
	"github.com/metacubex/gvisor/pkg/sync"
)

// Direction describes how to allocate offsets from MemoryFile.
type Direction int

const (
	// BottomUp allocates offsets in increasing order.
	BottomUp Direction = iota
	// TopDown allocates offsets in decreasing order.
	TopDown
)

// String implements fmt.Stringer.
func (d Direction) String() string {
	switch d {
	case BottomUp:
		return "up"
	case TopDown:
		return "down"
	}
	panic(fmt.Sprintf("invalid direction: %d", d))
}

// MemoryFile is a memmap.File whose pages may be allocated to arbitrary
// users.
type MemoryFile struct {
	// opts holds options passed to NewMemoryFile. opts is immutable.
	opts MemoryFileOpts

	// MemoryFile owns a single backing file, which is modeled as follows:
	//
	// Each page in the file can be committed or uncommitted. A page is
	// committed if the host kernel is spending resources to store its contents
	// and uncommitted otherwise. This definition includes pages that the host
	// kernel has swapped; this is intentional, to ensure that accounting does
	// not change even if host kernel swapping behavior changes, and that
	// memory used by pseudo-swap mechanisms like zswap is still accounted.
	//
	// The initial contents of uncommitted pages are implicitly zero bytes. A
	// read or write to the contents of an uncommitted page causes it to be
	// committed. This is the only event that can cause an uncommitted page to
	// be committed.
	//
	// fallocate(FALLOC_FL_PUNCH_HOLE) (MemoryFile.Decommit) causes committed
	// pages to be uncommitted. This is the only event that can cause a
	// committed page to be uncommitted.
	//
	// Memory accounting is based on identifying the set of committed pages.
	// Since we do not have direct access to the MMU, tracking reads and writes
	// to uncommitted pages to detect commitment would introduce additional
	// page faults, which would be prohibitively expensive. Instead, we query
	// the host kernel to determine which pages are committed.

	// file is the backing file. The file pointer is immutable.
	file *os.File

	mu memoryFileMutex

	// usage maps each page in the file to metadata for that page. Pages for
	// which no segment exists in usage are both unallocated (not in use) and
	// uncommitted.
	//
	// Since usage stores usageInfo objects by value, clients should usually
	// use usageIterator.ValuePtr() instead of usageIterator.Value() to get a
	// pointer to the usageInfo rather than a copy.
	//
	// usage must be kept maximally merged (that is, there should never be two
	// adjacent segments with the same values). At least markReclaimed depends
	// on this property.
	//
	// usage is protected by mu.
	usage usageSet

	// The UpdateUsage function scans all segments with knownCommitted set
	// to false, sees which pages are committed and creates corresponding
	// segments with knownCommitted set to true.
	//
	// In order to avoid unnecessary scans, usageExpected tracks the total
	// file blocks expected. This is used to elide the scan when this
	// matches the underlying file blocks.
	//
	// To track swapped pages, usageSwapped tracks the discrepancy between
	// what is observed in core and what is reported by the file. When
	// usageSwapped is non-zero, a sweep will be performed at least every
	// second. The start of the last sweep is recorded in usageLast.
	//
	// All usage attributes are protected by mu.
	usageExpected uint64
	usageSwapped  uint64
	usageLast     time.Time

	// fileSize is the size of the backing memory file in bytes. fileSize is
	// always a power-of-two multiple of chunkSize.
	//
	// fileSize is protected by mu.
	fileSize int64

	// Pages from the backing file are mapped into the local address space on
	// the granularity of large pieces called chunks. mappings is a []uintptr
	// that stores, for each chunk, the start address of a mapping of that
	// chunk in the current process' address space, or 0 if no such mapping
	// exists. Once a chunk is mapped, it is never remapped or unmapped until
	// the MemoryFile is destroyed.
	//
	// Mutating the mappings slice or its contents requires both holding
	// mappingsMu and using atomic memory operations. (The slice is mutated
	// whenever the file is expanded. Per the above, the only permitted
	// mutation of the slice's contents is the assignment of a mapping to a
	// chunk that was previously unmapped.) Reading the slice or its contents
	// only requires *either* holding mappingsMu or using atomic memory
	// operations. This allows MemoryFile.MapInternal to avoid locking in the
	// common case where chunk mappings already exist.
	mappingsMu mappingsMutex
	mappings   atomic.Pointer[[]uintptr]

	// destroyed is set by Destroy to instruct the reclaimer goroutine to
	// release resources and exit. destroyed is protected by mu.
	destroyed bool

	// reclaimable is true if usage may contain reclaimable pages. reclaimable
	// is protected by mu.
	reclaimable bool

	// reclaim is the collection of regions for reclaim. reclaim is protected
	// by mu.
	reclaim reclaimSet

	// reclaimCond is signaled (with mu locked) when reclaimable or destroyed
	// transitions from false to true.
	reclaimCond sync.Cond

	// evictable maps EvictableMemoryUsers to eviction state.
	//
	// evictable is protected by mu.
	evictable map[EvictableMemoryUser]*evictableMemoryUserInfo

	// evictionWG counts the number of goroutines currently performing evictions.
	evictionWG sync.WaitGroup

	// stopNotifyPressure stops memory cgroup pressure level
	// notifications used to drive eviction. stopNotifyPressure is
	// immutable.
	stopNotifyPressure func()

	// savable is true if this MemoryFile will be saved via SaveTo() during
	// the kernel's SaveTo operation. savable is protected by mu.
	savable bool
}

// MemoryFileOpts provides options to NewMemoryFile.
type MemoryFileOpts struct {
	// DelayedEviction controls the extent to which the MemoryFile may delay
	// eviction of evictable allocations.
	DelayedEviction DelayedEvictionType

	// If UseHostMemcgPressure is true, use host memory cgroup pressure level
	// notifications to determine when eviction is necessary. This option has
	// no effect unless DelayedEviction is DelayedEvictionEnabled.
	UseHostMemcgPressure bool

	// DecommitOnDestroy indicates whether the entire host file should be
	// decommitted on destruction. This is appropriate for host filesystem based
	// files that need to be explicitly cleaned up to release disk space.
	DecommitOnDestroy bool

	// If ManualZeroing is true, MemoryFile must not assume that new pages
	// obtained from the host are zero-filled, such that MemoryFile must manually
	// zero newly-allocated pages.
	ManualZeroing bool

	// If DisableIMAWorkAround is true, NewMemoryFile will not call
	// IMAWorkAroundForMemFile().
	DisableIMAWorkAround bool

	// DiskBackedFile indicates that the MemoryFile is backed by a file on disk.
	DiskBackedFile bool

	// RestoreID is an opaque string used to reassociate the MemoryFile with its
	// replacement during restore.
	RestoreID string
}

// DelayedEvictionType is the type of MemoryFileOpts.DelayedEviction.
type DelayedEvictionType int

const (
	// DelayedEvictionDefault has unspecified behavior.
	DelayedEvictionDefault DelayedEvictionType = iota

	// DelayedEvictionDisabled requires that evictable allocations are evicted
	// as soon as possible.
	DelayedEvictionDisabled

	// DelayedEvictionEnabled requests that the MemoryFile delay eviction of
	// evictable allocations until doing so is considered necessary to avoid
	// performance degradation due to host memory pressure, or OOM kills.
	//
	// As of this writing, the behavior of DelayedEvictionEnabled depends on
	// whether or not MemoryFileOpts.UseHostMemcgPressure is enabled:
	//
	//   - If UseHostMemcgPressure is true, evictions are delayed until memory
	//     pressure is indicated.
	//
	//   - Otherwise, evictions are only delayed until the reclaimer goroutine
	//     is out of work (pages to reclaim).
	DelayedEvictionEnabled

	// DelayedEvictionManual requires that evictable allocations are only
	// evicted when MemoryFile.StartEvictions() is called. This is extremely
	// dangerous outside of tests.
	DelayedEvictionManual
)
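
// The sketch below is illustrative and not part of the upstream file: it
// shows one way to construct a MemoryFile from an anonymous host memfd with
// the default (delayed) eviction policy. The function name and the memfd name
// are assumptions made for the example.
func exampleNewMemoryFile() (*MemoryFile, error) {
	fd, err := unix.MemfdCreate("pgalloc-example", 0)
	if err != nil {
		return nil, fmt.Errorf("memfd_create failed: %v", err)
	}
	// NewMemoryFile takes ownership of the file; it truncates it to 0 bytes
	// and grows it on demand as allocations are made.
	return NewMemoryFile(os.NewFile(uintptr(fd), "pgalloc-example"), MemoryFileOpts{
		DelayedEviction: DelayedEvictionEnabled,
	})
}
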
// usageInfo tracks usage information.
//
// +stateify savable
type usageInfo struct {
	// kind is the usage kind.
	kind usage.MemoryKind

	// knownCommitted is true if the tracked region is definitely committed.
	// (If it is false, the tracked region may or may not be committed.)
	knownCommitted bool

	refs uint64

	// memCgID is the memory cgroup id to which this page is committed.
	memCgID uint32
}

// canCommit returns true if the tracked region can be committed.
func (u *usageInfo) canCommit() bool {
	// refs must be greater than 0 because we assume that reclaimable pages
	// (that aren't already known to be committed) are not committed. This
	// isn't necessarily true, even after the reclaimer does Decommit(),
	// because the kernel may subsequently back the hugepage-sized region
	// containing the decommitted page with a hugepage. However, it's
	// consistent with our treatment of unallocated pages, which have the same
	// property.
	return !u.knownCommitted && u.refs != 0
}

// An EvictableMemoryUser represents a user of MemoryFile-allocated memory that
// may be asked to deallocate that memory in the presence of memory pressure.
type EvictableMemoryUser interface {
	// Evict requests that the EvictableMemoryUser deallocate memory used by
	// er, which was registered as evictable by a previous call to
	// MemoryFile.MarkEvictable.
	//
	// Evict is not required to deallocate memory. In particular, since pgalloc
	// must call Evict without holding locks to avoid circular lock ordering,
	// it is possible that the passed range has already been marked as
	// unevictable by a racing call to MemoryFile.MarkUnevictable.
	// Implementations of EvictableMemoryUser must detect such races and handle
	// them by making Evict have no effect on unevictable ranges.
	//
	// After a call to Evict, the MemoryFile will consider the evicted range
	// unevictable (i.e. it will not call Evict on the same range again) until
	// informed otherwise by a subsequent call to MarkEvictable.
	Evict(ctx context.Context, er EvictableRange)
}

// An EvictableRange represents a range of uint64 offsets in an
// EvictableMemoryUser.
//
// In practice, most EvictableMemoryUsers will probably be implementations of
// memmap.Mappable, and EvictableRange therefore corresponds to
// memmap.MappableRange. However, this package cannot depend on the memmap
// package, since doing so would create a circular dependency.
//
// type EvictableRange <generated using go_generics>

// evictableMemoryUserInfo is the value type of MemoryFile.evictable.
type evictableMemoryUserInfo struct {
	// ranges tracks all evictable ranges for the given user.
	ranges evictableRangeSet

	// If evicting is true, there is a goroutine currently evicting all
	// evictable ranges for this user.
	evicting bool
}

const (
	chunkShift = 30
	chunkSize  = 1 << chunkShift // 1 GB
	chunkMask  = chunkSize - 1

	// maxPage is the highest 64-bit page.
	maxPage = math.MaxUint64 &^ (hostarch.PageSize - 1)
)

// NewMemoryFile creates a MemoryFile backed by the given file. If
// NewMemoryFile succeeds, ownership of file is transferred to the returned
// MemoryFile.
func NewMemoryFile(file *os.File, opts MemoryFileOpts) (*MemoryFile, error) {
	switch opts.DelayedEviction {
	case DelayedEvictionDefault:
		opts.DelayedEviction = DelayedEvictionEnabled
	case DelayedEvictionDisabled, DelayedEvictionManual:
		opts.UseHostMemcgPressure = false
	case DelayedEvictionEnabled:
		// ok
	default:
		return nil, fmt.Errorf("invalid MemoryFileOpts.DelayedEviction: %v", opts.DelayedEviction)
	}

	// Truncate the file to 0 bytes first to ensure that it's empty.
	if err := file.Truncate(0); err != nil {
		return nil, err
	}
	f := &MemoryFile{
		opts:      opts,
		file:      file,
		evictable: make(map[EvictableMemoryUser]*evictableMemoryUserInfo),
	}
	f.mappings.Store(&[]uintptr{})
	f.reclaimCond.L = &f.mu

	if f.opts.DelayedEviction == DelayedEvictionEnabled && f.opts.UseHostMemcgPressure {
		stop, err := hostmm.NotifyCurrentMemcgPressureCallback(func() {
			f.mu.Lock()
			startedAny := f.startEvictionsLocked()
			f.mu.Unlock()
			if startedAny {
				log.Debugf("pgalloc.MemoryFile performing evictions due to memcg pressure")
			}
		}, "low")
		if err != nil {
			return nil, fmt.Errorf("failed to configure memcg pressure level notifications: %v", err)
		}
		f.stopNotifyPressure = stop
	}

	go f.runReclaim() // S/R-SAFE: f.mu

	if !opts.DisableIMAWorkAround {
		IMAWorkAroundForMemFile(file.Fd())
	}
	return f, nil
}

// IMAWorkAroundForMemFile works around IMA by immediately creating a temporary
// PROT_EXEC mapping, while the backing file is still small. IMA will ignore
// any future mappings.
//
// The Linux kernel contains an optional feature called "Integrity
// Measurement Architecture" (IMA). If IMA is enabled, it will checksum
// binaries the first time they are mapped PROT_EXEC. This is bad news for
// executable pages mapped from our backing file, which can grow to
// terabytes in (sparse) size. If IMA attempts to checksum a file that
// large, it will allocate all of the sparse pages and quickly exhaust all
// memory.
func IMAWorkAroundForMemFile(fd uintptr) {
	m, _, errno := unix.Syscall6(
		unix.SYS_MMAP,
		0,
		hostarch.PageSize,
		unix.PROT_EXEC,
		unix.MAP_SHARED,
		fd,
		0)
	if errno != 0 {
		// This isn't fatal (IMA may not even be in use). Log the error, but
		// don't return it.
		log.Warningf("Failed to pre-map MemoryFile PROT_EXEC: %v", errno)
	} else {
		if _, _, errno := unix.Syscall(
			unix.SYS_MUNMAP,
			m,
			hostarch.PageSize,
			0); errno != 0 {
			panic(fmt.Sprintf("failed to unmap PROT_EXEC MemoryFile mapping: %v", errno))
		}
	}
}

// Destroy releases all resources used by f.
//
// Preconditions: All pages allocated by f have been freed.
//
// Postconditions: None of f's methods may be called after Destroy.
func (f *MemoryFile) Destroy() {
	f.mu.Lock()
	defer f.mu.Unlock()
	f.destroyed = true
	f.reclaimCond.Signal()
}

// AllocationMode provides a way to inform the pgalloc API how to allocate
// memory and pages on the host.
// A page will exist in one of the following incremental states:
//  1. Allocated: A page is allocated if it was returned by Allocate() and its
//     reference count hasn't dropped to 0 since then.
//  2. Committed: As described in MemoryFile documentation above, a page is
//     committed if the host kernel is spending resources to store its
//     contents. A committed page is implicitly allocated.
//  3. Populated: A page is populated for reading/writing in a page table
//     hierarchy if it has a page table entry that permits reading/writing
//     respectively. A populated page is implicitly committed, since the page
//     table entry needs a physical page to point to, but not vice versa.
type AllocationMode int

const (
	// AllocateOnly indicates that pages need to only be allocated.
	AllocateOnly AllocationMode = iota
	// AllocateAndCommit indicates that pages need to be committed, in addition
	// to being allocated.
	AllocateAndCommit
	// AllocateAndWritePopulate indicates that writable pages should ideally be
	// populated in the page table, in addition to being allocated. This is a
	// suggestion, not a requirement.
	AllocateAndWritePopulate
)

// AllocOpts are options used in MemoryFile.Allocate.
type AllocOpts struct {
	// Kind is the memory kind to be used for accounting.
	Kind usage.MemoryKind
	// Dir indicates the direction in which offsets are allocated.
	Dir Direction
	// MemCgID is the memory cgroup ID; the zero value indicates that the
	// memory will not be accounted to any cgroup.
	MemCgID uint32
	// Mode allows the callers to select how the pages are allocated in the
	// MemoryFile. Callers that will fill the allocated memory by writing to it
	// should pass AllocateAndWritePopulate to avoid faulting page-by-page. Callers
	// that will fill the allocated memory by invoking host system calls should
	// pass AllocateOnly.
	Mode AllocationMode
	// If ReaderFunc is provided, the allocated memory is filled by calling it
	// repeatedly until either length bytes are read or a non-nil error is
	// returned. Allocate returns the allocated memory, truncated down to the
	// nearest page. If this is shorter than length bytes due to an error
	// returned by ReaderFunc, Allocate returns the partially filled fr and
	// the error.
	ReaderFunc safemem.ReaderFunc
}

// Allocate returns a range of initially-zeroed pages of the given length with
// the given accounting kind and a single reference held by the caller. When
// the last reference on an allocated page is released, ownership of the page
// is returned to the MemoryFile, allowing it to be returned by a future call
// to Allocate.
//
// Preconditions: length must be page-aligned and non-zero.
func (f *MemoryFile) Allocate(length uint64, opts AllocOpts) (memmap.FileRange, error) {
	fr, err := f.allocate(length, &opts)
	if err != nil {
		return memmap.FileRange{}, err
	}
	var dsts safemem.BlockSeq
	switch opts.Mode {
	case AllocateOnly: // Allocation is handled above. Nothing more to do.
	case AllocateAndCommit:
		if err := f.commitFile(fr); err != nil {
			f.DecRef(fr)
			return memmap.FileRange{}, err
		}
	case AllocateAndWritePopulate:
		dsts, err = f.MapInternal(fr, hostarch.Write)
		if err != nil {
			f.DecRef(fr)
			return memmap.FileRange{}, err
		}
		if canPopulate() {
			rem := dsts
			for {
				if !tryPopulate(rem.Head()) {
					break
				}
				rem = rem.Tail()
				if rem.IsEmpty() {
					break
				}
			}
		}
	default:
		panic(fmt.Sprintf("unknown allocation mode: %d", opts.Mode))
	}
	if opts.ReaderFunc != nil {
		if dsts.IsEmpty() {
			dsts, err = f.MapInternal(fr, hostarch.Write)
			if err != nil {
				f.DecRef(fr)
				return memmap.FileRange{}, err
			}
		}
		n, err := safemem.ReadFullToBlocks(opts.ReaderFunc, dsts)
		un := uint64(hostarch.Addr(n).RoundDown())
		if un < length {
			// Free unused memory and update fr to contain only the memory that is
			// still allocated.
			f.DecRef(memmap.FileRange{fr.Start + un, fr.End})
			fr.End = fr.Start + un
		}
		if err != nil {
			return fr, err
		}
	}
	return fr, nil
}
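
// The sketch below is illustrative and not part of the upstream file: it
// allocates anonymous pages, requests write-population because the caller
// will fill them directly, and copies caller-provided bytes in through
// MapInternal. The function name and the data parameter are assumptions made
// for the example; data is assumed to be non-empty.
func exampleAllocateAndFill(f *MemoryFile, data []byte) (memmap.FileRange, error) {
	// Allocate requires a page-aligned, non-zero length.
	length, _ := hostarch.Addr(len(data)).RoundUp()
	fr, err := f.Allocate(uint64(length), AllocOpts{
		Kind: usage.Anonymous,
		Mode: AllocateAndWritePopulate,
	})
	if err != nil {
		return memmap.FileRange{}, err
	}
	ims, err := f.MapInternal(fr, hostarch.Write)
	if err != nil {
		f.DecRef(fr) // drop the allocation on failure
		return memmap.FileRange{}, err
	}
	if _, err := safemem.CopySeq(ims, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(data))); err != nil {
		f.DecRef(fr)
		return memmap.FileRange{}, err
	}
	return fr, nil // the caller now owns one reference on fr
}
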
func (f *MemoryFile) allocate(length uint64, opts *AllocOpts) (memmap.FileRange, error) {
	if length == 0 || length%hostarch.PageSize != 0 {
		panic(fmt.Sprintf("invalid allocation length: %#x", length))
	}

	f.mu.Lock()
	defer f.mu.Unlock()

	// Align hugepage-and-larger allocations on hugepage boundaries to try
	// to take advantage of hugetmpfs.
	alignment := uint64(hostarch.PageSize)
	if length >= hostarch.HugePageSize {
		alignment = hostarch.HugePageSize
	}

	// Find a range in the underlying file.
	fr, ok := f.findAvailableRange(length, alignment, opts.Dir)
	if !ok {
		return memmap.FileRange{}, linuxerr.ENOMEM
	}

	// Expand the file if needed.
	if int64(fr.End) > f.fileSize {
		// Round the new file size up to be chunk-aligned.
		newFileSize := (int64(fr.End) + chunkMask) &^ chunkMask
		if err := f.file.Truncate(newFileSize); err != nil {
			return memmap.FileRange{}, err
		}
		f.fileSize = newFileSize
		f.mappingsMu.Lock()
		oldMappings := *f.mappings.Load()
		newMappings := make([]uintptr, newFileSize>>chunkShift)
		copy(newMappings, oldMappings)
		f.mappings.Store(&newMappings)
		f.mappingsMu.Unlock()
	}

	if f.opts.ManualZeroing {
		if err := f.manuallyZero(fr); err != nil {
			return memmap.FileRange{}, err
		}
	}
	// Mark selected pages as in use.
	f.usage.InsertRange(fr, usageInfo{
		kind:    opts.Kind,
		refs:    1,
		memCgID: opts.MemCgID,
	})

	return fr, nil
}

// findAvailableRange returns an available range in the usageSet.
//
// Note that scanning for available slots takes place from the end of the file
// backwards first, then forwards. This heuristic has important consequences
// for how sequential mappings can be merged in the host VMAs, given that
// addresses for both application and sentry mappings are allocated top-down
// (from higher to lower addresses). The file is also grown exponentially in
// order to create space for mappings to be allocated downwards.
//
// Precondition: alignment must be a power of 2.
func (f *MemoryFile) findAvailableRange(length, alignment uint64, dir Direction) (memmap.FileRange, bool) {
	if dir == BottomUp {
		return findAvailableRangeBottomUp(&f.usage, length, alignment)
	}
	return findAvailableRangeTopDown(&f.usage, f.fileSize, length, alignment)
}

func findAvailableRangeTopDown(usage *usageSet, fileSize int64, length, alignment uint64) (memmap.FileRange, bool) {
	alignmentMask := alignment - 1

	// Search for space in existing gaps, starting at the current end of the
	// file and working backward.
	lastGap := usage.LastGap()
	gap := lastGap
	for {
		end := gap.End()
		if end > uint64(fileSize) {
			end = uint64(fileSize)
		}

		// Try to allocate from the end of this gap, with the start of the
		// allocated range aligned down to alignment.
		unalignedStart := end - length
		if unalignedStart > end {
			// Negative overflow: this and all preceding gaps are too small to
			// accommodate length.
			break
		}
		if start := unalignedStart &^ alignmentMask; start >= gap.Start() {
			return memmap.FileRange{start, start + length}, true
		}

		gap = gap.PrevLargeEnoughGap(length)
		if !gap.Ok() {
			break
		}
	}

	// Check that it's possible to fit this allocation at the end of a file of any size.
	min := lastGap.Start()
	min = (min + alignmentMask) &^ alignmentMask
	if min+length < min {
		// Overflow: allocation would exceed the range of uint64.
		return memmap.FileRange{}, false
	}

	// Determine the minimum file size required to fit this allocation at its end.
	for {
		newFileSize := 2 * fileSize
		if newFileSize <= fileSize {
			if fileSize != 0 {
				// Overflow: allocation would exceed the range of int64.
				return memmap.FileRange{}, false
			}
			newFileSize = chunkSize
		}
		fileSize = newFileSize

		unalignedStart := uint64(fileSize) - length
		if unalignedStart > uint64(fileSize) {
			// Negative overflow: fileSize is still inadequate.
			continue
		}
		if start := unalignedStart &^ alignmentMask; start >= min {
			return memmap.FileRange{start, start + length}, true
		}
	}
}

func findAvailableRangeBottomUp(usage *usageSet, length, alignment uint64) (memmap.FileRange, bool) {
	alignmentMask := alignment - 1
	for gap := usage.FirstGap(); gap.Ok(); gap = gap.NextLargeEnoughGap(length) {
		// Align the start address and check if allocation still fits in the gap.
		start := (gap.Start() + alignmentMask) &^ alignmentMask

		// File offsets are int64s. Since length must be strictly positive, end
		// cannot legitimately be 0.
		end := start + length
		if end < start || int64(end) <= 0 {
			return memmap.FileRange{}, false
		}
		if end <= gap.End() {
			return memmap.FileRange{start, end}, true
		}
	}

	// NextLargeEnoughGap should have returned a gap at the end.
	panic(fmt.Sprintf("NextLargeEnoughGap didn't return a gap at the end, length: %d", length))
}

var mlockDisabled atomicbitops.Uint32
var madvPopulateWriteDisabled atomicbitops.Uint32

func canPopulate() bool {
	return mlockDisabled.Load() == 0 || madvPopulateWriteDisabled.Load() == 0
}

func tryPopulateMadv(b safemem.Block) bool {
	if madvPopulateWriteDisabled.Load() != 0 {
		return false
	}
	start, ok := hostarch.Addr(b.Addr()).RoundUp()
	if !ok {
		return true
	}
	end := hostarch.Addr(b.Addr() + uintptr(b.Len())).RoundDown()
	bLen := end - start
	// Only call madvise(MADV_POPULATE_WRITE) if >=2 pages are being populated.
	// 1 syscall overhead >= 1 page fault overhead. This is because syscalls are
	// susceptible to additional overheads like seccomp-bpf filters and auditing.
	if start >= end || bLen <= hostarch.PageSize {
		return true
	}
	_, _, errno := unix.RawSyscall(unix.SYS_MADVISE, uintptr(start), uintptr(bLen), unix.MADV_POPULATE_WRITE)
	if errno != 0 {
		if errno == unix.EINVAL {
			// EINVAL is expected if MADV_POPULATE_WRITE is not supported (Linux <5.14).
			log.Infof("Disabling pgalloc.MemoryFile.AllocateAndFill pre-population: madvise failed: %s", errno)
		} else {
			log.Warningf("Disabling pgalloc.MemoryFile.AllocateAndFill pre-population: madvise failed: %s", errno)
		}
		madvPopulateWriteDisabled.Store(1)
		return false
	}
	return true
}

func tryPopulateMlock(b safemem.Block) bool {
	if mlockDisabled.Load() != 0 {
		return false
	}
	// Call mlock to populate pages, then munlock to cancel the mlock (but keep
	// the pages populated). Only do so for hugepage-aligned address ranges to
	// ensure that splitting the VMA in mlock doesn't split any existing
	// hugepages. This assumes that two host syscalls, plus the MM overhead of
	// mlock + munlock, is faster on average than trapping for
	// HugePageSize/PageSize small page faults.
	start, ok := hostarch.Addr(b.Addr()).HugeRoundUp()
	if !ok {
		return true
	}
	end := hostarch.Addr(b.Addr() + uintptr(b.Len())).HugeRoundDown()
	if start >= end {
		return true
	}
	_, _, errno := unix.Syscall(unix.SYS_MLOCK, uintptr(start), uintptr(end-start), 0)
	unix.RawSyscall(unix.SYS_MUNLOCK, uintptr(start), uintptr(end-start), 0)
	if errno != 0 {
		if errno == unix.ENOMEM || errno == unix.EPERM {
			// These errors are expected from hitting non-zero RLIMIT_MEMLOCK, or
			// hitting zero RLIMIT_MEMLOCK without CAP_IPC_LOCK, respectively.
			log.Infof("Disabling pgalloc.MemoryFile.AllocateAndFill pre-population: mlock failed: %s", errno)
		} else {
			log.Warningf("Disabling pgalloc.MemoryFile.AllocateAndFill pre-population: mlock failed: %s", errno)
		}
		mlockDisabled.Store(1)
		return false
	}
	return true
}

func tryPopulate(b safemem.Block) bool {
	// There are two approaches for populating writable pages:
	// 1. madvise(MADV_POPULATE_WRITE). It has the desired effect: "Populate
	//    (prefault) page tables writable, faulting in all pages in the range
	//    just as if manually writing to each page".
	// 2. Call mlock to populate pages, then munlock to cancel the mlock (but
	//    keep the pages populated).
	//
	// Prefer the madvise(MADV_POPULATE_WRITE) approach because:
	// - Only requires 1 syscall, as opposed to 2 syscalls with mlock approach.
	// - It is faster because it doesn't have to modify vmas like mlock does.
	// - It works for disk-backed memory mappings too. The mlock approach doesn't
	//   work for disk-backed filesystems (e.g. ext4). This is because
	//   mlock(2) => mm/gup.c:__mm_populate() emulates a read fault on writable
	//   MAP_SHARED mappings. For memory-backed (shmem) files,
	//   mm/mmap.c:vma_set_page_prot() => vma_wants_writenotify() is false, so
	//   the page table entries populated by a read fault are writable. For
	//   disk-backed files, vma_set_page_prot() => vma_wants_writenotify() is
	//   true, so the page table entries populated by a read fault are read-only.
	if tryPopulateMadv(b) {
		return true
	}
	return tryPopulateMlock(b)
}

// fallocate(2) modes, defined in Linux's include/uapi/linux/falloc.h.
const (
	_FALLOC_FL_KEEP_SIZE  = 1
	_FALLOC_FL_PUNCH_HOLE = 2
)

// Decommit releases resources associated with maintaining the contents of the
// given pages. If Decommit succeeds, future accesses of the decommitted pages
// will read zeroes.
//
// Preconditions: fr.Length() > 0.
func (f *MemoryFile) Decommit(fr memmap.FileRange) error {
	if !fr.WellFormed() || fr.Length() == 0 || fr.Start%hostarch.PageSize != 0 || fr.End%hostarch.PageSize != 0 {
		panic(fmt.Sprintf("invalid range: %v", fr))
	}

	if f.opts.ManualZeroing {
		// FALLOC_FL_PUNCH_HOLE may not zero pages if ManualZeroing is in
		// effect.
		if err := f.manuallyZero(fr); err != nil {
			return err
		}
	} else {
		if err := f.decommitFile(fr); err != nil {
			return err
		}
	}

	f.markDecommitted(fr)
	return nil
}

func (f *MemoryFile) manuallyZero(fr memmap.FileRange) error {
	return f.forEachMappingSlice(fr, func(bs []byte) {
		for i := range bs {
			bs[i] = 0
		}
	})
}

func (f *MemoryFile) commitFile(fr memmap.FileRange) error {
	// "The default operation (i.e., mode is zero) of fallocate() allocates the
	// disk space within the range specified by offset and len." - fallocate(2)
	return unix.Fallocate(
		int(f.file.Fd()),
		0, // mode
		int64(fr.Start),
		int64(fr.Length()))
}

func (f *MemoryFile) decommitFile(fr memmap.FileRange) error {
	// "After a successful call, subsequent reads from this range will
	// return zeroes. The FALLOC_FL_PUNCH_HOLE flag must be ORed with
	// FALLOC_FL_KEEP_SIZE in mode ..." - fallocate(2)
	return unix.Fallocate(
		int(f.file.Fd()),
		_FALLOC_FL_PUNCH_HOLE|_FALLOC_FL_KEEP_SIZE,
		int64(fr.Start),
		int64(fr.Length()))
}

func (f *MemoryFile) markDecommitted(fr memmap.FileRange) {
	f.mu.Lock()
	defer f.mu.Unlock()
	// Since we're changing the knownCommitted attribute, we need to merge
	// across the entire range to ensure that the usage tree is minimal.
	f.usage.MutateFullRange(fr, func(seg usageIterator) bool {
		val := seg.ValuePtr()
		if val.knownCommitted {
			// Drop the usageExpected appropriately.
			amount := seg.Range().Length()
			usage.MemoryAccounting.Dec(amount, val.kind, val.memCgID)
			f.usageExpected -= amount
			val.knownCommitted = false
		}
		val.memCgID = 0
		return true
	})
}
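
// The sketch below is illustrative and not part of the upstream file: it
// releases the host resources behind a range whose contents are no longer
// needed, without giving up the allocation itself. Only whole pages can be
// decommitted, so the range is first shrunk to its page-aligned interior,
// mirroring the rounding helpers used elsewhere in this file. The function
// name is an assumption made for the example.
func exampleDropContents(f *MemoryFile, fr memmap.FileRange) error {
	start, _ := hostarch.Addr(fr.Start).RoundUp()
	end := hostarch.Addr(fr.End).RoundDown()
	if start >= end {
		return nil // nothing page-sized to release
	}
	// Subsequent reads of the decommitted pages observe zeroes, and the pages
	// stop counting as committed until they are touched again.
	return f.Decommit(memmap.FileRange{Start: uint64(start), End: uint64(end)})
}
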
// HasUniqueRef returns true if all pages in the given range have exactly one
// reference. A return value of false is inherently racy, but if the caller
// holds a reference on the given range and is preventing other goroutines
// from copying it, then a return value of true is not racy.
//
// Preconditions: At least one reference must be held on all pages in fr.
func (f *MemoryFile) HasUniqueRef(fr memmap.FileRange) bool {
	f.mu.Lock()
	defer f.mu.Unlock()
	hasUniqueRef := true
	f.usage.VisitFullRange(fr, func(seg usageIterator) bool {
		if seg.ValuePtr().refs != 1 {
			hasUniqueRef = false
			return false
		}
		return true
	})
	return hasUniqueRef
}

// IncRef implements memmap.File.IncRef.
func (f *MemoryFile) IncRef(fr memmap.FileRange, memCgID uint32) {
	if !fr.WellFormed() || fr.Length() == 0 || fr.Start%hostarch.PageSize != 0 || fr.End%hostarch.PageSize != 0 {
		panic(fmt.Sprintf("invalid range: %v", fr))
	}

	f.mu.Lock()
	defer f.mu.Unlock()

	f.usage.MutateFullRange(fr, func(seg usageIterator) bool {
		seg.ValuePtr().refs++
		return true
	})
}

// DecRef implements memmap.File.DecRef.
func (f *MemoryFile) DecRef(fr memmap.FileRange) {
	if !fr.WellFormed() || fr.Length() == 0 || fr.Start%hostarch.PageSize != 0 || fr.End%hostarch.PageSize != 0 {
		panic(fmt.Sprintf("invalid range: %v", fr))
	}

	var freed bool

	f.mu.Lock()
	defer f.mu.Unlock()

	f.usage.MutateFullRange(fr, func(seg usageIterator) bool {
		val := seg.ValuePtr()
		if val.refs == 0 {
			panic(fmt.Sprintf("DecRef(%v): 0 existing references on %v:\n%v", fr, seg.Range(), &f.usage))
		}
		val.refs--
		if val.refs == 0 {
			f.reclaim.InsertRange(seg.Range(), reclaimSetValue{})
			freed = true
			// Reclassify memory as System, until it's freed by the reclaim
			// goroutine.
			if val.knownCommitted {
				usage.MemoryAccounting.Move(seg.Range().Length(), usage.System, val.kind, val.memCgID)
			}
			val.kind = usage.System
		}
		return true
	})

	if freed {
		f.reclaimable = true
		f.reclaimCond.Signal()
	}
}

// MapInternal implements memmap.File.MapInternal.
func (f *MemoryFile) MapInternal(fr memmap.FileRange, at hostarch.AccessType) (safemem.BlockSeq, error) {
	if !fr.WellFormed() || fr.Length() == 0 {
		panic(fmt.Sprintf("invalid range: %v", fr))
	}
	if at.Execute {
		return safemem.BlockSeq{}, linuxerr.EACCES
	}

	chunks := ((fr.End + chunkMask) >> chunkShift) - (fr.Start >> chunkShift)
	if chunks == 1 {
		// Avoid an unnecessary slice allocation.
		var seq safemem.BlockSeq
		err := f.forEachMappingSlice(fr, func(bs []byte) {
			seq = safemem.BlockSeqOf(safemem.BlockFromSafeSlice(bs))
		})
		return seq, err
	}
	blocks := make([]safemem.Block, 0, chunks)
	err := f.forEachMappingSlice(fr, func(bs []byte) {
		blocks = append(blocks, safemem.BlockFromSafeSlice(bs))
	})
	return safemem.BlockSeqFromSlice(blocks), err
}
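
// The sketch below is illustrative and not part of the upstream file: it
// shows the reference-counting contract around IncRef/DecRef. The second
// owner and the cgroup ID of 0 (no cgroup accounting) are assumptions made
// for the example.
func exampleShareRange(f *MemoryFile, fr memmap.FileRange) {
	// Allocate() left the caller holding one reference; take a second one on
	// behalf of another logical owner before handing the range over.
	f.IncRef(fr, 0)
	// ... both owners may map and use fr, e.g. via f.MapInternal ...
	f.DecRef(fr) // first owner is done
	f.DecRef(fr) // last reference dropped: fr becomes reclaimable
}
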
// forEachMappingSlice invokes fn on a sequence of byte slices that
// collectively map all bytes in fr.
func (f *MemoryFile) forEachMappingSlice(fr memmap.FileRange, fn func([]byte)) error {
	mappings := *f.mappings.Load()
	for chunkStart := fr.Start &^ chunkMask; chunkStart < fr.End; chunkStart += chunkSize {
		chunk := int(chunkStart >> chunkShift)
		m := atomic.LoadUintptr(&mappings[chunk])
		if m == 0 {
			var err error
			mappings, m, err = f.getChunkMapping(chunk)
			if err != nil {
				return err
			}
		}
		startOff := uint64(0)
		if chunkStart < fr.Start {
			startOff = fr.Start - chunkStart
		}
		endOff := uint64(chunkSize)
		if chunkStart+chunkSize > fr.End {
			endOff = fr.End - chunkStart
		}
		fn(unsafeSlice(m, chunkSize)[startOff:endOff])
	}
	return nil
}

func (f *MemoryFile) getChunkMapping(chunk int) ([]uintptr, uintptr, error) {
	f.mappingsMu.Lock()
	defer f.mappingsMu.Unlock()
	// Another thread may have replaced f.mappings altogether due to file
	// expansion.
	mappings := *f.mappings.Load()
	// Another thread may have already mapped the chunk.
	if m := mappings[chunk]; m != 0 {
		return mappings, m, nil
	}
	m, _, errno := unix.Syscall6(
		unix.SYS_MMAP,
		0,
		chunkSize,
		unix.PROT_READ|unix.PROT_WRITE,
		unix.MAP_SHARED,
		f.file.Fd(),
		uintptr(chunk<<chunkShift))
	if errno != 0 {
		return nil, 0, errno
	}
	atomic.StoreUintptr(&mappings[chunk], m)
	return mappings, m, nil
}

// MarkEvictable allows f to request memory deallocation by calling
// user.Evict(er) in the future.
//
// Redundantly marking an already-evictable range as evictable has no effect.
func (f *MemoryFile) MarkEvictable(user EvictableMemoryUser, er EvictableRange) {
	f.mu.Lock()
	defer f.mu.Unlock()
	info, ok := f.evictable[user]
	if !ok {
		info = &evictableMemoryUserInfo{}
		f.evictable[user] = info
	}
	gap := info.ranges.LowerBoundGap(er.Start)
	for gap.Ok() && gap.Start() < er.End {
		gapER := gap.Range().Intersect(er)
		if gapER.Length() == 0 {
			gap = gap.NextGap()
			continue
		}
		gap = info.ranges.Insert(gap, gapER, evictableRangeSetValue{}).NextGap()
	}
	if !info.evicting {
		switch f.opts.DelayedEviction {
		case DelayedEvictionDisabled:
			// Kick off eviction immediately.
			f.startEvictionGoroutineLocked(user, info)
		case DelayedEvictionEnabled:
			if !f.opts.UseHostMemcgPressure {
				// Ensure that the reclaimer goroutine is running, so that it
				// can start eviction when necessary.
				f.reclaimCond.Signal()
			}
		}
	}
}
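
// The sketch below is illustrative and not part of the upstream file: a
// minimal EvictableMemoryUser that backs a cache with MemoryFile pages,
// registers the cached offsets as evictable, and drops everything when asked.
// The type, its fields, and its single-range cache policy are assumptions
// made for the example; real users (e.g. memmap.Mappables) track many ranges.
type exampleEvictableCache struct {
	mf *MemoryFile

	mu sync.Mutex
	fr memmap.FileRange // backing pages; zero if the cache is empty
}

// fill allocates backing pages for the cache and marks them evictable.
// length is assumed to be page-aligned and non-zero, as Allocate requires.
func (c *exampleEvictableCache) fill(length uint64) error {
	fr, err := c.mf.Allocate(length, AllocOpts{Kind: usage.PageCache, Mode: AllocateAndWritePopulate})
	if err != nil {
		return err
	}
	c.mu.Lock()
	c.fr = fr
	c.mu.Unlock()
	c.mf.MarkEvictable(c, EvictableRange{0, length})
	return nil
}

// Evict implements EvictableMemoryUser.Evict.
func (c *exampleEvictableCache) Evict(ctx context.Context, er EvictableRange) {
	c.mu.Lock()
	defer c.mu.Unlock()
	if c.fr.Length() == 0 {
		// Nothing cached: either already evicted, or a racing MarkUnevictable
		// won; per the interface contract, do nothing in that case.
		return
	}
	c.mf.DecRef(c.fr)
	c.fr = memmap.FileRange{}
}
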
// MarkUnevictable informs f that user no longer considers er to be evictable,
// so the MemoryFile should no longer call user.Evict(er). Note that, per
// EvictableMemoryUser.Evict's documentation, user.Evict(er) may still be
// called even after MarkUnevictable returns due to race conditions, and
// implementations of EvictableMemoryUser must handle this possibility.
//
// Redundantly marking an already-unevictable range as unevictable has no
// effect.
func (f *MemoryFile) MarkUnevictable(user EvictableMemoryUser, er EvictableRange) {
	f.mu.Lock()
	defer f.mu.Unlock()
	info, ok := f.evictable[user]
	if !ok {
		return
	}
	seg := info.ranges.LowerBoundSegment(er.Start)
	for seg.Ok() && seg.Start() < er.End {
		seg = info.ranges.Isolate(seg, er)
		seg = info.ranges.Remove(seg).NextSegment()
	}
	// We can only remove info if there's no eviction goroutine running on its
	// behalf.
	if !info.evicting && info.ranges.IsEmpty() {
		delete(f.evictable, user)
	}
}

// MarkAllUnevictable informs f that user no longer considers any offsets to be
// evictable. It otherwise has the same semantics as MarkUnevictable.
func (f *MemoryFile) MarkAllUnevictable(user EvictableMemoryUser) {
	f.mu.Lock()
	defer f.mu.Unlock()
	info, ok := f.evictable[user]
	if !ok {
		return
	}
	info.ranges.RemoveAll()
	// We can only remove info if there's no eviction goroutine running on its
	// behalf.
	if !info.evicting {
		delete(f.evictable, user)
	}
}

// ShouldCacheEvictable returns true if f is meaningfully delaying evictions of
// evictable memory, such that it may be advantageous to cache data in
// evictable memory. The value returned by ShouldCacheEvictable may change
// between calls.
func (f *MemoryFile) ShouldCacheEvictable() bool {
	return f.opts.DelayedEviction == DelayedEvictionManual || f.opts.UseHostMemcgPressure
}

// UpdateUsage ensures that the memory usage statistics in
// usage.MemoryAccounting are up to date. If memCgIDs is nil, all pages
// will be scanned. Otherwise, only the pages that belong to the memory
// cgroup IDs in memCgIDs will be scanned and the memory usage will be
// updated.
func (f *MemoryFile) UpdateUsage(memCgIDs map[uint32]struct{}) error {
	f.mu.Lock()
	defer f.mu.Unlock()

	// If the underlying usage matches what the usage tree already
	// represents, then we can just avoid the entire scan (we know it's
	// accurate).
	currentUsage, err := f.TotalUsage()
	if err != nil {
		return err
	}
	if currentUsage == f.usageExpected && f.usageSwapped == 0 {
		log.Debugf("UpdateUsage: skipped with usageSwapped=0.")
		return nil
	}
	// If the current usage matches the expected but there's swap
	// accounting, then ensure a scan takes place at least every second
	// (when requested).
	if currentUsage == f.usageExpected+f.usageSwapped && time.Now().Before(f.usageLast.Add(time.Second)) {
		log.Debugf("UpdateUsage: skipped with usageSwapped!=0.")
		return nil
	}

	// Linux updates usage values at CONFIG_HZ.
	if scanningAfter := time.Now().Sub(f.usageLast).Milliseconds(); scanningAfter < time.Second.Milliseconds()/linux.CLOCKS_PER_SEC {
		log.Debugf("UpdateUsage: skipped because previous scan happened %d ms back", scanningAfter)
		return nil
	}

	if memCgIDs == nil {
		f.usageLast = time.Now()
	}
	err = f.updateUsageLocked(currentUsage, memCgIDs, mincore)
	log.Debugf("UpdateUsage: currentUsage=%d, usageExpected=%d, usageSwapped=%d.",
		currentUsage, f.usageExpected, f.usageSwapped)
	log.Debugf("UpdateUsage: took %v.", time.Since(f.usageLast))
	return err
}

// updateUsageLocked attempts to detect commitment of previously-uncommitted
// pages by invoking checkCommitted, which is a function that, for each page i
// in bs, sets committed[i] to 1 if the page is committed and 0 otherwise.
//
// Precondition: f.mu must be held; it may be unlocked and reacquired.
// +checklocks:f.mu
func (f *MemoryFile) updateUsageLocked(currentUsage uint64, memCgIDs map[uint32]struct{}, checkCommitted func(bs []byte, committed []byte) error) error {
	// Track if anything changed to elide the merge. In the common case, we
	// expect all segments to be committed and no merge to occur.
	changedAny := false
	defer func() {
		if changedAny {
			f.usage.MergeAll()
		}

		// Adjust the swap usage to reflect reality.
		if f.usageExpected < currentUsage {
			// Since no pages may be marked decommitted while we hold mu, we
			// know that usage may have only increased since we got the last
			// current usage. Therefore, if usageExpected is still short of
			// currentUsage, we must assume that the difference is in pages
			// that have been swapped.
			newUsageSwapped := currentUsage - f.usageExpected
			if f.usageSwapped < newUsageSwapped {
				usage.MemoryAccounting.Inc(newUsageSwapped-f.usageSwapped, usage.System, 0)
			} else {
				usage.MemoryAccounting.Dec(f.usageSwapped-newUsageSwapped, usage.System, 0)
			}
			f.usageSwapped = newUsageSwapped
		} else if f.usageSwapped != 0 {
			// We have more usage accounted for than the file itself.
			// That's fine, we probably caught a race where pages were
			// being committed while the below loop was running. Just
			// report the higher number that we found and ignore swap.
			usage.MemoryAccounting.Dec(f.usageSwapped, usage.System, 0)
			f.usageSwapped = 0
		}
	}()

	// Reused mincore buffer, will generally be <= 4096 bytes.
	var buf []byte

	// Iterate over all usage data. There will only be usage segments
	// present when there is an associated reference.
	for seg := f.usage.FirstSegment(); seg.Ok(); {
		if !seg.ValuePtr().canCommit() {
			seg = seg.NextSegment()
			continue
		}

		// Scan the pages of the given memCgID only. This will avoid scanning the
		// whole memory file when the memory usage is required only for a specific
		// cgroup. The total memory usage of all cgroups can be obtained when the
		// memCgIDs is nil.
		if memCgIDs != nil {
			if _, ok := memCgIDs[seg.ValuePtr().memCgID]; !ok {
				seg = seg.NextSegment()
				continue
			}
		}

		// Get the range for this segment. As we touch slices, the
		// Start value will be walked along.
		r := seg.Range()

		var checkErr error
		err := f.forEachMappingSlice(r,
			func(s []byte) {
				if checkErr != nil {
					return
				}

				// Ensure that we have sufficient buffer for the call
				// (one byte per page). The length of each slice must
				// be page-aligned.
				bufLen := len(s) / hostarch.PageSize
				if len(buf) < bufLen {
					buf = make([]byte, bufLen)
				}

				// Query for new pages in core.
				// NOTE(b/165896008): mincore (which is passed as checkCommitted)
				// by f.UpdateUsage() might take a really long time. So unlock f.mu
				// while checkCommitted runs.
				f.mu.Unlock() // +checklocksforce
				err := checkCommitted(s, buf)
				f.mu.Lock()
				if err != nil {
					checkErr = err
					return
				}

				// Scan each page and switch out segments.
				seg := f.usage.LowerBoundSegment(r.Start)
				for i := 0; i < bufLen; {
					if buf[i]&0x1 == 0 {
						i++
						continue
					}
					// Scan to the end of this committed range.
					j := i + 1
					for ; j < bufLen; j++ {
						if buf[j]&0x1 == 0 {
							break
						}
					}
					committedFR := memmap.FileRange{
						Start: r.Start + uint64(i*hostarch.PageSize),
						End:   r.Start + uint64(j*hostarch.PageSize),
					}
					// Advance seg to committedFR.Start.
					for seg.Ok() && seg.End() < committedFR.Start {
						seg = seg.NextSegment()
					}
					// Mark pages overlapping committedFR as committed.
					for seg.Ok() && seg.Start() < committedFR.End {
						if seg.ValuePtr().canCommit() {
							seg = f.usage.Isolate(seg, committedFR)
							seg.ValuePtr().knownCommitted = true
							amount := seg.Range().Length()
							usage.MemoryAccounting.Inc(amount, seg.ValuePtr().kind, seg.ValuePtr().memCgID)
							f.usageExpected += amount
							changedAny = true
						}
						seg = seg.NextSegment()
					}
					// Continue scanning for committed pages.
					i = j + 1
				}

				// Advance r.Start.
				r.Start += uint64(len(s))
			})
		if checkErr != nil {
			return checkErr
		}
		if err != nil {
			return err
		}

		// Continue with the first segment after r.End.
		seg = f.usage.LowerBoundSegment(r.End)
	}

	return nil
}

// TotalUsage returns an aggregate usage for all memory statistics except
// Mapped (which is external to MemoryFile). This is generally much cheaper
// than UpdateUsage, but will not provide a fine-grained breakdown.
func (f *MemoryFile) TotalUsage() (uint64, error) {
	// Stat the underlying file to discover the underlying usage. stat(2)
	// always reports the allocated block count in units of 512 bytes. This
	// includes pages in the page cache and swapped pages.
	var stat unix.Stat_t
	if err := unix.Fstat(int(f.file.Fd()), &stat); err != nil {
		return 0, err
	}
	return uint64(stat.Blocks * 512), nil
}

// TotalSize returns the current size of the backing file in bytes, which is an
// upper bound on the amount of memory that can currently be allocated from the
// MemoryFile. The value returned by TotalSize is permitted to change.
func (f *MemoryFile) TotalSize() uint64 {
	f.mu.Lock()
	defer f.mu.Unlock()
	return uint64(f.fileSize)
}

// File returns the backing file.
func (f *MemoryFile) File() *os.File {
	return f.file
}
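
// The sketch below is illustrative and not part of the upstream file: it
// refreshes the global accounting and logs the cheap aggregate numbers. A nil
// cgroup set scans every committed page; callers that only care about
// specific cgroups would pass their IDs instead. The function name is an
// assumption made for the example.
func exampleRefreshUsage(f *MemoryFile) error {
	if err := f.UpdateUsage(nil); err != nil {
		return err
	}
	total, err := f.TotalUsage()
	if err != nil {
		return err
	}
	log.Debugf("MemoryFile: %d bytes committed on the host, backing file size %d bytes", total, f.TotalSize())
	return nil
}
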
// FD implements memmap.File.FD.
func (f *MemoryFile) FD() int {
	return int(f.file.Fd())
}

// IsDiskBacked returns true if f is backed by a file on disk.
func (f *MemoryFile) IsDiskBacked() bool {
	return f.opts.DiskBackedFile
}

// String implements fmt.Stringer.String.
//
// Note that because f.String locks f.mu, calling f.String internally
// (including indirectly through the fmt package) risks recursive locking.
// Within the pgalloc package, use f.usage directly instead.
func (f *MemoryFile) String() string {
	f.mu.Lock()
	defer f.mu.Unlock()
	return f.usage.String()
}

// runReclaim implements the reclaimer goroutine, which continuously decommits
// reclaimable pages in order to reduce memory usage and make them available
// for allocation.
func (f *MemoryFile) runReclaim() {
	for {
		// N.B. We must call f.markReclaimed on the returned FileRange.
		fr, ok := f.findReclaimable()
		if !ok {
			break
		}

		if f.opts.ManualZeroing {
			// If ManualZeroing is in effect, only hugepage-aligned regions may
			// be safely passed to decommitFile. Pages will be zeroed on
			// reallocation, so we don't need to perform any manual zeroing
			// here, whether or not decommitFile succeeds.
			if startAddr, ok := hostarch.Addr(fr.Start).HugeRoundUp(); ok {
				if endAddr := hostarch.Addr(fr.End).HugeRoundDown(); startAddr < endAddr {
					decommitFR := memmap.FileRange{uint64(startAddr), uint64(endAddr)}
					if err := f.decommitFile(decommitFR); err != nil {
						log.Warningf("Reclaim failed to decommit %v: %v", decommitFR, err)
					}
				}
			}
		} else {
			if err := f.decommitFile(fr); err != nil {
				log.Warningf("Reclaim failed to decommit %v: %v", fr, err)
				// Zero the pages manually. This won't reduce memory usage, but at
				// least ensures that the pages will be zero when reallocated.
				if err := f.manuallyZero(fr); err != nil {
					panic(fmt.Sprintf("Reclaim failed to decommit or zero %v: %v", fr, err))
				}
			}
		}
		f.markDecommitted(fr)
		f.markReclaimed(fr)
	}

	// We only get here if findReclaimable finds f.destroyed set and returns
	// false.
	f.mu.Lock()
	if !f.destroyed {
		f.mu.Unlock()
		panic("findReclaimable broke out of reclaim loop, but destroyed is no longer set")
	}
	if f.opts.DecommitOnDestroy && f.fileSize > 0 {
		if err := f.decommitFile(memmap.FileRange{Start: 0, End: uint64(f.fileSize)}); err != nil {
			f.mu.Unlock()
			panic(fmt.Sprintf("failed to decommit entire memory file during destruction: %v", err))
		}
	}
	f.file.Close()
	// Ensure that any attempts to use f.file.Fd() fail instead of getting a fd
	// that has possibly been reassigned.
	f.file = nil
	f.mappingsMu.Lock()
	defer f.mappingsMu.Unlock()
	mappings := *f.mappings.Load()
	for i, m := range mappings {
		if m != 0 {
			_, _, errno := unix.Syscall(unix.SYS_MUNMAP, m, chunkSize, 0)
			if errno != 0 {
				log.Warningf("Failed to unmap mapping %#x for MemoryFile chunk %d: %v", m, i, errno)
			}
		}
	}
	// Similarly, invalidate f.mappings.
	f.mappings.Store(nil)
	f.mu.Unlock()

	// This must be called without holding f.mu to avoid circular lock
	// ordering.
	if f.stopNotifyPressure != nil {
		f.stopNotifyPressure()
	}
}

// findReclaimable finds memory that has been marked for reclaim.
//
// Note that the returned range will be removed from tracking. It
// must be reclaimed (removed from f.usage) at this point.
func (f *MemoryFile) findReclaimable() (memmap.FileRange, bool) {
	f.mu.Lock()
	defer f.mu.Unlock()
	for {
		for {
			if f.destroyed {
				return memmap.FileRange{}, false
			}
			if f.reclaimable {
				break
			}
			if f.opts.DelayedEviction == DelayedEvictionEnabled && !f.opts.UseHostMemcgPressure {
				// No work to do. Evict any pending evictable allocations to
				// get more reclaimable pages before going to sleep.
				f.startEvictionsLocked()
			}
			f.reclaimCond.Wait()
		}
		// Most allocations are done upwards, with exceptions being stacks and some
		// allocators that allocate top-down. Reclaim preserves this order to
		// minimize the cost of the search.
		if seg := f.reclaim.FirstSegment(); seg.Ok() {
			fr := seg.Range()
			f.reclaim.Remove(seg)
			return fr, true
		}
		// Nothing is reclaimable.
		f.reclaimable = false
	}
}

func (f *MemoryFile) markReclaimed(fr memmap.FileRange) {
	f.mu.Lock()
	defer f.mu.Unlock()
	seg := f.usage.FindSegment(fr.Start)
	// All of fr should be mapped to a single uncommitted reclaimable
	// segment accounted to System.
	if !seg.Ok() {
		panic(fmt.Sprintf("reclaimed pages %v include unreferenced pages:\n%v", fr, &f.usage))
	}
	if !seg.Range().IsSupersetOf(fr) {
		panic(fmt.Sprintf("reclaimed pages %v are not entirely contained in segment %v with state %v:\n%v", fr, seg.Range(), seg.Value(), &f.usage))
	}
	if got, want := seg.Value(), (usageInfo{
		kind:           usage.System,
		knownCommitted: false,
		refs:           0,
		memCgID:        0,
	}); got != want {
		panic(fmt.Sprintf("reclaimed pages %v in segment %v has incorrect state %v, wanted %v:\n%v", fr, seg.Range(), got, want, &f.usage))
	}
	// Deallocate reclaimed pages. Even though all of seg is reclaimable,
	// the caller of markReclaimed may not have decommitted it, so we can
	// only mark fr as reclaimed.
	f.usage.Remove(f.usage.Isolate(seg, fr))
}

// StartEvictions requests that f evict all evictable allocations. It does not
// wait for eviction to complete; for this, see MemoryFile.WaitForEvictions.
func (f *MemoryFile) StartEvictions() {
	f.mu.Lock()
	defer f.mu.Unlock()
	f.startEvictionsLocked()
}

// Preconditions: f.mu must be locked.
func (f *MemoryFile) startEvictionsLocked() bool {
	startedAny := false
	for user, info := range f.evictable {
		// Don't start multiple goroutines to evict the same user's
		// allocations.
		if !info.evicting {
			f.startEvictionGoroutineLocked(user, info)
			startedAny = true
		}
	}
	return startedAny
}

// Preconditions:
//   - info == f.evictable[user].
//   - !info.evicting.
//   - f.mu must be locked.
func (f *MemoryFile) startEvictionGoroutineLocked(user EvictableMemoryUser, info *evictableMemoryUserInfo) {
	info.evicting = true
	f.evictionWG.Add(1)
	go func() { // S/R-SAFE: f.evictionWG
		defer f.evictionWG.Done()
		for {
			f.mu.Lock()
			info, ok := f.evictable[user]
			if !ok {
				// This shouldn't happen: only this goroutine is permitted
				// to delete this entry.
				f.mu.Unlock()
				panic(fmt.Sprintf("evictableMemoryUserInfo for EvictableMemoryUser %v deleted while eviction goroutine running", user))
			}
			if info.ranges.IsEmpty() {
				delete(f.evictable, user)
				f.mu.Unlock()
				return
			}
			// Evict from the end of info.ranges, under the assumption that
			// if ranges in user start being used again (and are
			// consequently marked unevictable), such uses are more likely
			// to start from the beginning of user.
			seg := info.ranges.LastSegment()
			er := seg.Range()
			info.ranges.Remove(seg)
			// user.Evict() must be called without holding f.mu to avoid
			// circular lock ordering.
			f.mu.Unlock()
			user.Evict(context.Background(), er)
		}
	}()
}

// WaitForEvictions blocks until f is no longer evicting any evictable
// allocations.
func (f *MemoryFile) WaitForEvictions() {
	f.evictionWG.Wait()
}

type usageSetFunctions struct{}

func (usageSetFunctions) MinKey() uint64 {
	return 0
}

func (usageSetFunctions) MaxKey() uint64 {
	return math.MaxUint64
}

func (usageSetFunctions) ClearValue(val *usageInfo) {
}

func (usageSetFunctions) Merge(_ memmap.FileRange, val1 usageInfo, _ memmap.FileRange, val2 usageInfo) (usageInfo, bool) {
	return val1, val1 == val2
}

func (usageSetFunctions) Split(_ memmap.FileRange, val usageInfo, _ uint64) (usageInfo, usageInfo) {
	return val, val
}

// evictableRangeSetValue is the value type of evictableRangeSet.
type evictableRangeSetValue struct{}

type evictableRangeSetFunctions struct{}

func (evictableRangeSetFunctions) MinKey() uint64 {
	return 0
}

func (evictableRangeSetFunctions) MaxKey() uint64 {
	return math.MaxUint64
}

func (evictableRangeSetFunctions) ClearValue(val *evictableRangeSetValue) {
}

func (evictableRangeSetFunctions) Merge(_ EvictableRange, _ evictableRangeSetValue, _ EvictableRange, _ evictableRangeSetValue) (evictableRangeSetValue, bool) {
	return evictableRangeSetValue{}, true
}

func (evictableRangeSetFunctions) Split(_ EvictableRange, _ evictableRangeSetValue, _ uint64) (evictableRangeSetValue, evictableRangeSetValue) {
	return evictableRangeSetValue{}, evictableRangeSetValue{}
}

// reclaimSetValue is the value type of reclaimSet.
type reclaimSetValue struct{}

type reclaimSetFunctions struct{}

func (reclaimSetFunctions) MinKey() uint64 {
	return 0
}

func (reclaimSetFunctions) MaxKey() uint64 {
	return math.MaxUint64
}

func (reclaimSetFunctions) ClearValue(val *reclaimSetValue) {
}

func (reclaimSetFunctions) Merge(_ memmap.FileRange, _ reclaimSetValue, _ memmap.FileRange, _ reclaimSetValue) (reclaimSetValue, bool) {
	return reclaimSetValue{}, true
}

func (reclaimSetFunctions) Split(_ memmap.FileRange, _ reclaimSetValue, _ uint64) (reclaimSetValue, reclaimSetValue) {
	return reclaimSetValue{}, reclaimSetValue{}
}