gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/pkg/sentry/pgalloc/pgalloc.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // Package pgalloc contains the page allocator subsystem, which manages memory 16 // that may be mapped into application address spaces. 17 // 18 // Lock order: 19 // 20 // pgalloc.MemoryFile.mu 21 // pgalloc.MemoryFile.mappingsMu 22 package pgalloc 23 24 import ( 25 "fmt" 26 "math" 27 "os" 28 "sync/atomic" 29 "time" 30 31 "golang.org/x/sys/unix" 32 "gvisor.dev/gvisor/pkg/abi/linux" 33 "gvisor.dev/gvisor/pkg/atomicbitops" 34 "gvisor.dev/gvisor/pkg/context" 35 "gvisor.dev/gvisor/pkg/errors/linuxerr" 36 "gvisor.dev/gvisor/pkg/hostarch" 37 "gvisor.dev/gvisor/pkg/log" 38 "gvisor.dev/gvisor/pkg/safemem" 39 "gvisor.dev/gvisor/pkg/sentry/hostmm" 40 "gvisor.dev/gvisor/pkg/sentry/memmap" 41 "gvisor.dev/gvisor/pkg/sentry/usage" 42 "gvisor.dev/gvisor/pkg/sync" 43 ) 44 45 // Direction describes how to allocate offsets from MemoryFile. 46 type Direction int 47 48 const ( 49 // BottomUp allocates offsets in increasing offsets. 50 BottomUp Direction = iota 51 // TopDown allocates offsets in decreasing offsets. 52 TopDown 53 ) 54 55 // String implements fmt.Stringer. 56 func (d Direction) String() string { 57 switch d { 58 case BottomUp: 59 return "up" 60 case TopDown: 61 return "down" 62 } 63 panic(fmt.Sprintf("invalid direction: %d", d)) 64 } 65 66 // MemoryFile is a memmap.File whose pages may be allocated to arbitrary 67 // users. 68 type MemoryFile struct { 69 memmap.NoBufferedIOFallback 70 71 // opts holds options passed to NewMemoryFile. opts is immutable. 72 opts MemoryFileOpts 73 74 // MemoryFile owns a single backing file, which is modeled as follows: 75 // 76 // Each page in the file can be committed or uncommitted. A page is 77 // committed if the host kernel is spending resources to store its contents 78 // and uncommitted otherwise. This definition includes pages that the host 79 // kernel has swapped; this is intentional, to ensure that accounting does 80 // not change even if host kernel swapping behavior changes, and that 81 // memory used by pseudo-swap mechanisms like zswap is still accounted. 82 // 83 // The initial contents of uncommitted pages are implicitly zero bytes. A 84 // read or write to the contents of an uncommitted page causes it to be 85 // committed. This is the only event that can cause a uncommitted page to 86 // be committed. 87 // 88 // fallocate(FALLOC_FL_PUNCH_HOLE) (MemoryFile.Decommit) causes committed 89 // pages to be uncommitted. This is the only event that can cause a 90 // committed page to be uncommitted. 91 // 92 // Memory accounting is based on identifying the set of committed pages. 93 // Since we do not have direct access to the MMU, tracking reads and writes 94 // to uncommitted pages to detect commitment would introduce additional 95 // page faults, which would be prohibitively expensive. 
Instead, we query 96 // the host kernel to determine which pages are committed. 97 98 // file is the backing file. The file pointer is immutable. 99 file *os.File 100 101 mu memoryFileMutex 102 103 // usage maps each page in the file to metadata for that page. Pages for 104 // which no segment exists in usage are both unallocated (not in use) and 105 // uncommitted. 106 // 107 // Since usage stores usageInfo objects by value, clients should usually 108 // use usageIterator.ValuePtr() instead of usageIterator.Value() to get a 109 // pointer to the usageInfo rather than a copy. 110 // 111 // usage must be kept maximally merged (that is, there should never be two 112 // adjacent segments with the same values). At least markReclaimed depends 113 // on this property. 114 // 115 // usage is protected by mu. 116 usage usageSet 117 118 // The UpdateUsage function scans all segments with knownCommitted set 119 // to false, sees which pages are committed and creates corresponding 120 // segments with knownCommitted set to true. 121 // 122 // In order to avoid unnecessary scans, usageExpected tracks the total 123 // file blocks expected. This is used to elide the scan when this 124 // matches the underlying file blocks. 125 // 126 // To track swapped pages, usageSwapped tracks the discrepancy between 127 // what is observed in core and what is reported by the file. When 128 // usageSwapped is non-zero, a sweep will be performed at least every 129 // second. The start of the last sweep is recorded in usageLast. 130 // 131 // All usage attributes are all protected by mu. 132 usageExpected uint64 133 usageSwapped uint64 134 usageLast time.Time 135 136 // fileSize is the size of the backing memory file in bytes. fileSize is 137 // always a power-of-two multiple of chunkSize. 138 // 139 // fileSize is protected by mu. 140 fileSize int64 141 142 // Pages from the backing file are mapped into the local address space on 143 // the granularity of large pieces called chunks. mappings is a []uintptr 144 // that stores, for each chunk, the start address of a mapping of that 145 // chunk in the current process' address space, or 0 if no such mapping 146 // exists. Once a chunk is mapped, it is never remapped or unmapped until 147 // the MemoryFile is destroyed. 148 // 149 // Mutating the mappings slice or its contents requires both holding 150 // mappingsMu and using atomic memory operations. (The slice is mutated 151 // whenever the file is expanded. Per the above, the only permitted 152 // mutation of the slice's contents is the assignment of a mapping to a 153 // chunk that was previously unmapped.) Reading the slice or its contents 154 // only requires *either* holding mappingsMu or using atomic memory 155 // operations. This allows MemoryFile.MapInternal to avoid locking in the 156 // common case where chunk mappings already exist. 157 mappingsMu mappingsMutex 158 mappings atomic.Pointer[[]uintptr] 159 160 // destroyed is set by Destroy to instruct the reclaimer goroutine to 161 // release resources and exit. destroyed is protected by mu. 162 destroyed bool 163 164 // reclaimable is true if usage may contain reclaimable pages. reclaimable 165 // is protected by mu. 166 reclaimable bool 167 168 // reclaim is the collection of regions for reclaim. reclaim is protected 169 // by mu. 170 reclaim reclaimSet 171 172 // reclaimCond is signaled (with mu locked) when reclaimable or destroyed 173 // transitions from false to true. 
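	//
	// As a sketch of the pattern (not the exact code), the reclaimer's
	// findReclaimable waits on it as follows, and DecRef, Destroy and (in
	// some configurations) MarkEvictable are the signalers:
	//
	//	f.mu.Lock()
	//	for !f.reclaimable && !f.destroyed {
	//		f.reclaimCond.Wait()
	//	}
	//	f.mu.Unlock()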
174 reclaimCond sync.Cond 175 176 // evictable maps EvictableMemoryUsers to eviction state. 177 // 178 // evictable is protected by mu. 179 evictable map[EvictableMemoryUser]*evictableMemoryUserInfo 180 181 // evictionWG counts the number of goroutines currently performing evictions. 182 evictionWG sync.WaitGroup 183 184 // stopNotifyPressure stops memory cgroup pressure level 185 // notifications used to drive eviction. stopNotifyPressure is 186 // immutable. 187 stopNotifyPressure func() 188 189 // savable is true if this MemoryFile will be saved via SaveTo() during 190 // the kernel's SaveTo operation. savable is protected by mu. 191 savable bool 192 } 193 194 // MemoryFileOpts provides options to NewMemoryFile. 195 type MemoryFileOpts struct { 196 // DelayedEviction controls the extent to which the MemoryFile may delay 197 // eviction of evictable allocations. 198 DelayedEviction DelayedEvictionType 199 200 // If UseHostMemcgPressure is true, use host memory cgroup pressure level 201 // notifications to determine when eviction is necessary. This option has 202 // no effect unless DelayedEviction is DelayedEvictionEnabled. 203 UseHostMemcgPressure bool 204 205 // DecommitOnDestroy indicates whether the entire host file should be 206 // decommitted on destruction. This is appropriate for host filesystem based 207 // files that need to be explicitly cleaned up to release disk space. 208 DecommitOnDestroy bool 209 210 // If ManualZeroing is true, MemoryFile must not assume that new pages 211 // obtained from the host are zero-filled, such that MemoryFile must manually 212 // zero newly-allocated pages. 213 ManualZeroing bool 214 215 // If DisableIMAWorkAround is true, NewMemoryFile will not call 216 // IMAWorkAroundForMemFile(). 217 DisableIMAWorkAround bool 218 219 // DiskBackedFile indicates that the MemoryFile is backed by a file on disk. 220 DiskBackedFile bool 221 222 // RestoreID is an opaque string used to reassociate the MemoryFile with its 223 // replacement during restore. 224 RestoreID string 225 226 // EnforceMaximumAllocatable is a flag that governs whether the MemoryFile 227 // will be limited in size of total allocations by 228 // usage.MaximumAllocatableBytes. 229 EnforceMaximumAllocatable bool 230 } 231 232 // DelayedEvictionType is the type of MemoryFileOpts.DelayedEviction. 233 type DelayedEvictionType int 234 235 const ( 236 // DelayedEvictionDefault has unspecified behavior. 237 DelayedEvictionDefault DelayedEvictionType = iota 238 239 // DelayedEvictionDisabled requires that evictable allocations are evicted 240 // as soon as possible. 241 DelayedEvictionDisabled 242 243 // DelayedEvictionEnabled requests that the MemoryFile delay eviction of 244 // evictable allocations until doing so is considered necessary to avoid 245 // performance degradation due to host memory pressure, or OOM kills. 246 // 247 // As of this writing, the behavior of DelayedEvictionEnabled depends on 248 // whether or not MemoryFileOpts.UseHostMemcgPressure is enabled: 249 // 250 // - If UseHostMemcgPressure is true, evictions are delayed until memory 251 // pressure is indicated. 252 // 253 // - Otherwise, evictions are only delayed until the reclaimer goroutine 254 // is out of work (pages to reclaim). 255 DelayedEvictionEnabled 256 257 // DelayedEvictionManual requires that evictable allocations are only 258 // evicted when MemoryFile.StartEvictions() is called. This is extremely 259 // dangerous outside of tests. 260 DelayedEvictionManual 261 ) 262 263 // usageInfo tracks usage information. 
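//
// As an illustrative lifecycle (the kind and cgroup ID below are
// hypothetical): a page allocated with AllocOpts{Kind: usage.Anonymous,
// MemCgID: 2} starts as {kind: usage.Anonymous, knownCommitted: false,
// refs: 1, memCgID: 2}; a later UpdateUsage scan that finds it resident sets
// knownCommitted to true; when its last reference is dropped, DecRef
// reclassifies it as usage.System until the reclaimer removes the segment.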
264 // 265 // +stateify savable 266 type usageInfo struct { 267 // kind is the usage kind. 268 kind usage.MemoryKind 269 270 // knownCommitted is true if the tracked region is definitely committed. 271 // (If it is false, the tracked region may or may not be committed.) 272 knownCommitted bool 273 274 refs uint64 275 276 // memCgID is the memory cgroup id to which this page is committed. 277 memCgID uint32 278 } 279 280 // An EvictableMemoryUser represents a user of MemoryFile-allocated memory that 281 // may be asked to deallocate that memory in the presence of memory pressure. 282 type EvictableMemoryUser interface { 283 // Evict requests that the EvictableMemoryUser deallocate memory used by 284 // er, which was registered as evictable by a previous call to 285 // MemoryFile.MarkEvictable. 286 // 287 // Evict is not required to deallocate memory. In particular, since pgalloc 288 // must call Evict without holding locks to avoid circular lock ordering, 289 // it is possible that the passed range has already been marked as 290 // unevictable by a racing call to MemoryFile.MarkUnevictable. 291 // Implementations of EvictableMemoryUser must detect such races and handle 292 // them by making Evict have no effect on unevictable ranges. 293 // 294 // After a call to Evict, the MemoryFile will consider the evicted range 295 // unevictable (i.e. it will not call Evict on the same range again) until 296 // informed otherwise by a subsequent call to MarkEvictable. 297 Evict(ctx context.Context, er EvictableRange) 298 } 299 300 // An EvictableRange represents a range of uint64 offsets in an 301 // EvictableMemoryUser. 302 // 303 // In practice, most EvictableMemoryUsers will probably be implementations of 304 // memmap.Mappable, and EvictableRange therefore corresponds to 305 // memmap.MappableRange. However, this package cannot depend on the memmap 306 // package, since doing so would create a circular dependency. 307 // 308 // type EvictableRange <generated using go_generics> 309 310 // evictableMemoryUserInfo is the value type of MemoryFile.evictable. 311 type evictableMemoryUserInfo struct { 312 // ranges tracks all evictable ranges for the given user. 313 ranges evictableRangeSet 314 315 // If evicting is true, there is a goroutine currently evicting all 316 // evictable ranges for this user. 317 evicting bool 318 } 319 320 const ( 321 chunkShift = 30 322 chunkSize = 1 << chunkShift // 1 GB 323 chunkMask = chunkSize - 1 324 325 // maxPage is the highest 64-bit page. 326 maxPage = math.MaxUint64 &^ (hostarch.PageSize - 1) 327 ) 328 329 // NewMemoryFile creates a MemoryFile backed by the given file. If 330 // NewMemoryFile succeeds, ownership of file is transferred to the returned 331 // MemoryFile. 332 func NewMemoryFile(file *os.File, opts MemoryFileOpts) (*MemoryFile, error) { 333 switch opts.DelayedEviction { 334 case DelayedEvictionDefault: 335 opts.DelayedEviction = DelayedEvictionEnabled 336 case DelayedEvictionDisabled, DelayedEvictionManual: 337 opts.UseHostMemcgPressure = false 338 case DelayedEvictionEnabled: 339 // ok 340 default: 341 return nil, fmt.Errorf("invalid MemoryFileOpts.DelayedEviction: %v", opts.DelayedEviction) 342 } 343 344 // Truncate the file to 0 bytes first to ensure that it's empty. 
345 if err := file.Truncate(0); err != nil { 346 return nil, err 347 } 348 f := &MemoryFile{ 349 opts: opts, 350 file: file, 351 evictable: make(map[EvictableMemoryUser]*evictableMemoryUserInfo), 352 } 353 f.mappings.Store(&[]uintptr{}) 354 f.reclaimCond.L = &f.mu 355 356 if f.opts.DelayedEviction == DelayedEvictionEnabled && f.opts.UseHostMemcgPressure { 357 stop, err := hostmm.NotifyCurrentMemcgPressureCallback(func() { 358 f.mu.Lock() 359 startedAny := f.startEvictionsLocked() 360 f.mu.Unlock() 361 if startedAny { 362 log.Debugf("pgalloc.MemoryFile performing evictions due to memcg pressure") 363 } 364 }, "low") 365 if err != nil { 366 return nil, fmt.Errorf("failed to configure memcg pressure level notifications: %v", err) 367 } 368 f.stopNotifyPressure = stop 369 } 370 371 go f.runReclaim() // S/R-SAFE: f.mu 372 373 if !opts.DisableIMAWorkAround { 374 IMAWorkAroundForMemFile(file.Fd()) 375 } 376 return f, nil 377 } 378 379 // IMAWorkAroundForMemFile works around IMA by immediately creating a temporary 380 // PROT_EXEC mapping, while the backing file is still small. IMA will ignore 381 // any future mappings. 382 // 383 // The Linux kernel contains an optional feature called "Integrity 384 // Measurement Architecture" (IMA). If IMA is enabled, it will checksum 385 // binaries the first time they are mapped PROT_EXEC. This is bad news for 386 // executable pages mapped from our backing file, which can grow to 387 // terabytes in (sparse) size. If IMA attempts to checksum a file that 388 // large, it will allocate all of the sparse pages and quickly exhaust all 389 // memory. 390 func IMAWorkAroundForMemFile(fd uintptr) { 391 m, _, errno := unix.Syscall6( 392 unix.SYS_MMAP, 393 0, 394 hostarch.PageSize, 395 unix.PROT_EXEC, 396 unix.MAP_SHARED, 397 fd, 398 0) 399 if errno != 0 { 400 // This isn't fatal (IMA may not even be in use). Log the error, but 401 // don't return it. 402 log.Warningf("Failed to pre-map MemoryFile PROT_EXEC: %v", errno) 403 } else { 404 if _, _, errno := unix.Syscall( 405 unix.SYS_MUNMAP, 406 m, 407 hostarch.PageSize, 408 0); errno != 0 { 409 panic(fmt.Sprintf("failed to unmap PROT_EXEC MemoryFile mapping: %v", errno)) 410 } 411 } 412 } 413 414 // Destroy releases all resources used by f. 415 // 416 // Preconditions: All pages allocated by f have been freed. 417 // 418 // Postconditions: None of f's methods may be called after Destroy. 419 func (f *MemoryFile) Destroy() { 420 f.mu.Lock() 421 defer f.mu.Unlock() 422 f.destroyed = true 423 f.reclaimCond.Signal() 424 } 425 426 // AllocationMode provides a way to inform the pgalloc API how to allocate 427 // memory and pages on the host. 428 // A page will exist in one of the following incremental states: 429 // 1. Allocated: A page is allocated if it was returned by Allocate() and its 430 // reference count hasn't dropped to 0 since then. 431 // 2. Committed: As described in MemoryFile documentation above, a page is 432 // committed if the host kernel is spending resources to store its 433 // contents. A committed page is implicitly allocated. 434 // 3. Populated: A page is populated for reading/writing in a page table 435 // hierarchy if it has a page table entry that permits reading/writing 436 // respectively. A populated page is implicitly committed, since the page 437 // table entry needs a physical page to point to, but not vice versa. 438 type AllocationMode int 439 440 const ( 441 // AllocateOnly indicates that pages need to only be allocated. 
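	// For example (per AllocOpts.Mode below), a caller that will fill the
	// range by directing host syscalls at the backing file gains little from
	// committing or populating pages up front.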
442 AllocateOnly AllocationMode = iota 443 // AllocateAndCommit indicates that pages need to be committed, in addition 444 // to being allocated. 445 AllocateAndCommit 446 // AllocateAndWritePopulate indicates that writable pages should ideally be 447 // populated in the page table, in addition to being allocated. This is a 448 // suggestion, not a requirement. 449 AllocateAndWritePopulate 450 ) 451 452 // AllocOpts are options used in MemoryFile.Allocate. 453 type AllocOpts struct { 454 // Kind is the memory kind to be used for accounting. 455 Kind usage.MemoryKind 456 // Dir indicates the direction in which offsets are allocated. 457 Dir Direction 458 // MemCgID is the memory cgroup ID and the zero value indicates that 459 // the memory will not be accounted to any cgroup. 460 MemCgID uint32 461 // Mode allows the callers to select how the pages are allocated in the 462 // MemoryFile. Callers that will fill the allocated memory by writing to it 463 // should pass AllocateAndWritePopulate to avoid faulting page-by-page. Callers 464 // that will fill the allocated memory by invoking host system calls should 465 // pass AllocateOnly. 466 Mode AllocationMode 467 // If ReaderFunc is provided, the allocated memory is filled by calling it 468 // repeatedly until either length bytes are read or a non-nil error is 469 // returned. It returns the allocated memory, truncated down to the nearest 470 // page. If this is shorter than length bytes due to an error returned by 471 // ReaderFunc, it returns the partially filled fr and error. 472 ReaderFunc safemem.ReaderFunc 473 } 474 475 // Allocate returns a range of initially-zeroed pages of the given length with 476 // the given accounting kind and a single reference held by the caller. When 477 // the last reference on an allocated page is released, ownership of the page 478 // is returned to the MemoryFile, allowing it to be returned by a future call 479 // to Allocate. 480 // 481 // Preconditions: length must be page-aligned and non-zero. 482 func (f *MemoryFile) Allocate(length uint64, opts AllocOpts) (memmap.FileRange, error) { 483 fr, err := f.allocate(length, &opts) 484 if err != nil { 485 return memmap.FileRange{}, err 486 } 487 var dsts safemem.BlockSeq 488 switch opts.Mode { 489 case AllocateOnly: // Allocation is handled above. Nothing more to do. 490 case AllocateAndCommit: 491 if err := f.commitFile(fr); err != nil { 492 f.DecRef(fr) 493 return memmap.FileRange{}, err 494 } 495 case AllocateAndWritePopulate: 496 dsts, err = f.MapInternal(fr, hostarch.Write) 497 if err != nil { 498 f.DecRef(fr) 499 return memmap.FileRange{}, err 500 } 501 if canPopulate() { 502 rem := dsts 503 for { 504 if !tryPopulate(rem.Head()) { 505 break 506 } 507 rem = rem.Tail() 508 if rem.IsEmpty() { 509 break 510 } 511 } 512 } 513 default: 514 panic(fmt.Sprintf("unknown allocation mode: %d", opts.Mode)) 515 } 516 if opts.ReaderFunc != nil { 517 if dsts.IsEmpty() { 518 dsts, err = f.MapInternal(fr, hostarch.Write) 519 if err != nil { 520 f.DecRef(fr) 521 return memmap.FileRange{}, err 522 } 523 } 524 n, err := safemem.ReadFullToBlocks(opts.ReaderFunc, dsts) 525 un := uint64(hostarch.Addr(n).RoundDown()) 526 if un < length { 527 // Free unused memory and update fr to contain only the memory that is 528 // still allocated. 
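			// (Worked example with hypothetical numbers: for an 8-page
			// allocation where ReaderFunc fails after writing 5.5 pages,
			// un rounds down to 5 pages, the trailing 3 pages are released
			// here, and fr shrinks to the 5 fully-written pages.)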
529 f.DecRef(memmap.FileRange{fr.Start + un, fr.End}) 530 fr.End = fr.Start + un 531 } 532 if err != nil { 533 return fr, err 534 } 535 } 536 return fr, nil 537 } 538 539 func (f *MemoryFile) allocate(length uint64, opts *AllocOpts) (memmap.FileRange, error) { 540 if length == 0 || length%hostarch.PageSize != 0 { 541 panic(fmt.Sprintf("invalid allocation length: %#x", length)) 542 } 543 544 f.mu.Lock() 545 defer f.mu.Unlock() 546 547 if !f.hasSpaceToAllocate(length) { 548 log.Debugf("Enforcing memory limit on allocation of size %d, max is %d, already have %d", length, usage.MaximumAllocatableBytes, f.usageExpected) 549 return memmap.FileRange{}, linuxerr.ENOMEM 550 } 551 552 // Align hugepage-and-larger allocations on hugepage boundaries to try 553 // to take advantage of hugetmpfs. 554 alignment := uint64(hostarch.PageSize) 555 if length >= hostarch.HugePageSize { 556 alignment = hostarch.HugePageSize 557 } 558 559 // Find a range in the underlying file. 560 fr, ok := f.findAvailableRange(length, alignment, opts.Dir) 561 if !ok { 562 return memmap.FileRange{}, linuxerr.ENOMEM 563 } 564 565 // Expand the file if needed. 566 if int64(fr.End) > f.fileSize { 567 // Round the new file size up to be chunk-aligned. 568 newFileSize := (int64(fr.End) + chunkMask) &^ chunkMask 569 if err := f.file.Truncate(newFileSize); err != nil { 570 return memmap.FileRange{}, err 571 } 572 f.fileSize = newFileSize 573 f.mappingsMu.Lock() 574 oldMappings := *f.mappings.Load() 575 newMappings := make([]uintptr, newFileSize>>chunkShift) 576 copy(newMappings, oldMappings) 577 f.mappings.Store(&newMappings) 578 f.mappingsMu.Unlock() 579 } 580 581 if f.opts.ManualZeroing { 582 if err := f.manuallyZero(fr); err != nil { 583 return memmap.FileRange{}, err 584 } 585 } 586 // Mark selected pages as in use. 587 f.usage.InsertRange(fr, usageInfo{ 588 kind: opts.Kind, 589 refs: 1, 590 memCgID: opts.MemCgID, 591 }) 592 593 return fr, nil 594 } 595 596 func (f *MemoryFile) hasSpaceToAllocate(length uint64) bool { 597 if f.opts.EnforceMaximumAllocatable && usage.MaximumAllocatableBytes != 0 && ((f.usageExpected+length) > usage.MaximumAllocatableBytes || (f.usageExpected+length) < f.usageExpected) { 598 // f.usageExpected is not guaranteed to be correct because it is 599 // updated only when f.UpdateUsage is called periodically. 600 // To eliminate false-positives double check against the exact 601 // measure; we don't care as much about false-negatives, which 602 // helps avoid a host-syscall via f.TotalUsage in the happy-path. 603 exactUsage, err := f.TotalUsage() 604 if err != nil { 605 log.Warningf("Failed to fetch total usage for memory file: %v", err) 606 return false 607 } 608 if (exactUsage+length) > usage.MaximumAllocatableBytes || (exactUsage+length) < exactUsage { 609 return false 610 } 611 } 612 return true 613 } 614 615 // findAvailableRange returns an available range in the usageSet. 616 // 617 // Note that scanning for available slots takes place from end first backwards, 618 // then forwards. This heuristic has important consequence for how sequential 619 // mappings can be merged in the host VMAs, given that addresses for both 620 // application and sentry mappings are allocated top-down (from higher to 621 // lower addresses). The file is also grown exponentially in order to create 622 // space for mappings to be allocated downwards. 623 // 624 // Precondition: alignment must be a power of 2. 
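//
// As a concrete illustration of the alignment arithmetic used below (the
// numbers are made up): aligning a candidate start of 0x12345000 down to a
// 2 MiB boundary clears the low bits, 0x12345000 &^ (0x200000-1) =
// 0x12200000, and the candidate is only accepted if it still lies within the
// gap under consideration.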
625 func (f *MemoryFile) findAvailableRange(length, alignment uint64, dir Direction) (memmap.FileRange, bool) { 626 if dir == BottomUp { 627 return findAvailableRangeBottomUp(&f.usage, length, alignment) 628 } 629 return findAvailableRangeTopDown(&f.usage, f.fileSize, length, alignment) 630 } 631 632 func findAvailableRangeTopDown(usage *usageSet, fileSize int64, length, alignment uint64) (memmap.FileRange, bool) { 633 alignmentMask := alignment - 1 634 635 // Search for space in existing gaps, starting at the current end of the 636 // file and working backward. 637 lastGap := usage.LastGap() 638 gap := lastGap 639 for { 640 end := gap.End() 641 if end > uint64(fileSize) { 642 end = uint64(fileSize) 643 } 644 645 // Try to allocate from the end of this gap, with the start of the 646 // allocated range aligned down to alignment. 647 unalignedStart := end - length 648 if unalignedStart > end { 649 // Negative overflow: this and all preceding gaps are too small to 650 // accommodate length. 651 break 652 } 653 if start := unalignedStart &^ alignmentMask; start >= gap.Start() { 654 return memmap.FileRange{start, start + length}, true 655 } 656 657 gap = gap.PrevLargeEnoughGap(length) 658 if !gap.Ok() { 659 break 660 } 661 } 662 663 // Check that it's possible to fit this allocation at the end of a file of any size. 664 min := lastGap.Start() 665 min = (min + alignmentMask) &^ alignmentMask 666 if min+length < min { 667 // Overflow: allocation would exceed the range of uint64. 668 return memmap.FileRange{}, false 669 } 670 671 // Determine the minimum file size required to fit this allocation at its end. 672 for { 673 newFileSize := 2 * fileSize 674 if newFileSize <= fileSize { 675 if fileSize != 0 { 676 // Overflow: allocation would exceed the range of int64. 677 return memmap.FileRange{}, false 678 } 679 newFileSize = chunkSize 680 } 681 fileSize = newFileSize 682 683 unalignedStart := uint64(fileSize) - length 684 if unalignedStart > uint64(fileSize) { 685 // Negative overflow: fileSize is still inadequate. 686 continue 687 } 688 if start := unalignedStart &^ alignmentMask; start >= min { 689 return memmap.FileRange{start, start + length}, true 690 } 691 } 692 } 693 694 func findAvailableRangeBottomUp(usage *usageSet, length, alignment uint64) (memmap.FileRange, bool) { 695 alignmentMask := alignment - 1 696 for gap := usage.FirstGap(); gap.Ok(); gap = gap.NextLargeEnoughGap(length) { 697 // Align the start address and check if allocation still fits in the gap. 698 start := (gap.Start() + alignmentMask) &^ alignmentMask 699 700 // File offsets are int64s. Since length must be strictly positive, end 701 // cannot legitimately be 0. 702 end := start + length 703 if end < start || int64(end) <= 0 { 704 return memmap.FileRange{}, false 705 } 706 if end <= gap.End() { 707 return memmap.FileRange{start, end}, true 708 } 709 } 710 711 // NextLargeEnoughGap should have returned a gap at the end. 
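	// (The usage set's key space extends to math.MaxUint64, so for any length
	// that can actually be allocated there is a large-enough trailing gap;
	// ranges that would overflow int64 are rejected inside the loop above.)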
712 panic(fmt.Sprintf("NextLargeEnoughGap didn't return a gap at the end, length: %d", length)) 713 } 714 715 var mlockDisabled atomicbitops.Uint32 716 var madvPopulateWriteDisabled atomicbitops.Uint32 717 718 func canPopulate() bool { 719 return mlockDisabled.Load() == 0 || madvPopulateWriteDisabled.Load() == 0 720 } 721 722 func tryPopulateMadv(b safemem.Block) bool { 723 if madvPopulateWriteDisabled.Load() != 0 { 724 return false 725 } 726 start, ok := hostarch.Addr(b.Addr()).RoundUp() 727 if !ok { 728 return true 729 } 730 end := hostarch.Addr(b.Addr() + uintptr(b.Len())).RoundDown() 731 bLen := end - start 732 // Only call madvise(MADV_POPULATE_WRITE) if >=2 pages are being populated. 733 // 1 syscall overhead >= 1 page fault overhead. This is because syscalls are 734 // susceptible to additional overheads like seccomp-bpf filters and auditing. 735 if start >= end || bLen <= hostarch.PageSize { 736 return true 737 } 738 _, _, errno := unix.RawSyscall(unix.SYS_MADVISE, uintptr(start), uintptr(bLen), unix.MADV_POPULATE_WRITE) 739 if errno != 0 { 740 if errno == unix.EINVAL { 741 // EINVAL is expected if MADV_POPULATE_WRITE is not supported (Linux <5.14). 742 log.Infof("Disabling pgalloc.MemoryFile.AllocateAndFill pre-population: madvise failed: %s", errno) 743 } else { 744 log.Warningf("Disabling pgalloc.MemoryFile.AllocateAndFill pre-population: madvise failed: %s", errno) 745 } 746 madvPopulateWriteDisabled.Store(1) 747 return false 748 } 749 return true 750 } 751 752 func tryPopulateMlock(b safemem.Block) bool { 753 if mlockDisabled.Load() != 0 { 754 return false 755 } 756 // Call mlock to populate pages, then munlock to cancel the mlock (but keep 757 // the pages populated). Only do so for hugepage-aligned address ranges to 758 // ensure that splitting the VMA in mlock doesn't split any existing 759 // hugepages. This assumes that two host syscalls, plus the MM overhead of 760 // mlock + munlock, is faster on average than trapping for 761 // HugePageSize/PageSize small page faults. 762 start, ok := hostarch.Addr(b.Addr()).HugeRoundUp() 763 if !ok { 764 return true 765 } 766 end := hostarch.Addr(b.Addr() + uintptr(b.Len())).HugeRoundDown() 767 if start >= end { 768 return true 769 } 770 _, _, errno := unix.Syscall(unix.SYS_MLOCK, uintptr(start), uintptr(end-start), 0) 771 unix.RawSyscall(unix.SYS_MUNLOCK, uintptr(start), uintptr(end-start), 0) 772 if errno != 0 { 773 if errno == unix.ENOMEM || errno == unix.EPERM { 774 // These errors are expected from hitting non-zero RLIMIT_MEMLOCK, or 775 // hitting zero RLIMIT_MEMLOCK without CAP_IPC_LOCK, respectively. 776 log.Infof("Disabling pgalloc.MemoryFile.AllocateAndFill pre-population: mlock failed: %s", errno) 777 } else { 778 log.Warningf("Disabling pgalloc.MemoryFile.AllocateAndFill pre-population: mlock failed: %s", errno) 779 } 780 mlockDisabled.Store(1) 781 return false 782 } 783 return true 784 } 785 786 func tryPopulate(b safemem.Block) bool { 787 // There are two approaches for populating writable pages: 788 // 1. madvise(MADV_POPULATE_WRITE). It has the desired effect: "Populate 789 // (prefault) page tables writable, faulting in all pages in the range 790 // just as if manually writing to each each page". 791 // 2. Call mlock to populate pages, then munlock to cancel the mlock (but 792 // keep the pages populated). 793 // 794 // Prefer the madvise(MADV_POPULATE_WRITE) approach because: 795 // - Only requires 1 syscall, as opposed to 2 syscalls with mlock approach. 
796 // - It is faster because it doesn't have to modify vmas like mlock does. 797 // - It works for disk-backed memory mappings too. The mlock approach doesn't 798 // work for disk-backed filesystems (e.g. ext4). This is because 799 // mlock(2) => mm/gup.c:__mm_populate() emulates a read fault on writable 800 // MAP_SHARED mappings. For memory-backed (shmem) files, 801 // mm/mmap.c:vma_set_page_prot() => vma_wants_writenotify() is false, so 802 // the page table entries populated by a read fault are writable. For 803 // disk-backed files, vma_set_page_prot() => vma_wants_writenotify() is 804 // true, so the page table entries populated by a read fault are read-only. 805 if tryPopulateMadv(b) { 806 return true 807 } 808 return tryPopulateMlock(b) 809 } 810 811 // fallocate(2) modes, defined in Linux's include/uapi/linux/falloc.h. 812 const ( 813 _FALLOC_FL_KEEP_SIZE = 1 814 _FALLOC_FL_PUNCH_HOLE = 2 815 ) 816 817 // Decommit releases resources associated with maintaining the contents of the 818 // given pages. If Decommit succeeds, future accesses of the decommitted pages 819 // will read zeroes. 820 // 821 // Preconditions: fr.Length() > 0. 822 func (f *MemoryFile) Decommit(fr memmap.FileRange) error { 823 if !fr.WellFormed() || fr.Length() == 0 || fr.Start%hostarch.PageSize != 0 || fr.End%hostarch.PageSize != 0 { 824 panic(fmt.Sprintf("invalid range: %v", fr)) 825 } 826 827 if f.opts.ManualZeroing { 828 // FALLOC_FL_PUNCH_HOLE may not zero pages if ManualZeroing is in 829 // effect. 830 if err := f.manuallyZero(fr); err != nil { 831 return err 832 } 833 } else { 834 if err := f.decommitFile(fr); err != nil { 835 return err 836 } 837 } 838 839 f.markDecommitted(fr) 840 return nil 841 } 842 843 func (f *MemoryFile) manuallyZero(fr memmap.FileRange) error { 844 return f.forEachMappingSlice(fr, func(bs []byte) { 845 clear(bs) 846 }) 847 } 848 849 func (f *MemoryFile) commitFile(fr memmap.FileRange) error { 850 // "The default operation (i.e., mode is zero) of fallocate() allocates the 851 // disk space within the range specified by offset and len." - fallocate(2) 852 return unix.Fallocate( 853 int(f.file.Fd()), 854 0, // mode 855 int64(fr.Start), 856 int64(fr.Length())) 857 } 858 859 func (f *MemoryFile) decommitFile(fr memmap.FileRange) error { 860 // "After a successful call, subsequent reads from this range will 861 // return zeroes. The FALLOC_FL_PUNCH_HOLE flag must be ORed with 862 // FALLOC_FL_KEEP_SIZE in mode ..." - fallocate(2) 863 return unix.Fallocate( 864 int(f.file.Fd()), 865 _FALLOC_FL_PUNCH_HOLE|_FALLOC_FL_KEEP_SIZE, 866 int64(fr.Start), 867 int64(fr.Length())) 868 } 869 870 func (f *MemoryFile) markDecommitted(fr memmap.FileRange) { 871 f.mu.Lock() 872 defer f.mu.Unlock() 873 // Since we're changing the knownCommitted attribute, we need to merge 874 // across the entire range to ensure that the usage tree is minimal. 875 f.usage.MutateFullRange(fr, func(seg usageIterator) bool { 876 val := seg.ValuePtr() 877 if val.knownCommitted { 878 // Drop the usageExpected appropriately. 879 amount := seg.Range().Length() 880 usage.MemoryAccounting.Dec(amount, val.kind, val.memCgID) 881 f.usageExpected -= amount 882 val.knownCommitted = false 883 } 884 val.memCgID = 0 885 return true 886 }) 887 } 888 889 // HasUniqueRef returns true if all pages in the given range have exactly one 890 // reference. 
A return value of false is inherently racy, but if the caller 891 // holds a reference on the given range and is preventing other goroutines from 892 // copying it, then a return value of true is not racy. 893 // 894 // Preconditions: At least one reference must be held on all pages in fr. 895 func (f *MemoryFile) HasUniqueRef(fr memmap.FileRange) bool { 896 f.mu.Lock() 897 defer f.mu.Unlock() 898 hasUniqueRef := true 899 f.usage.VisitFullRange(fr, func(seg usageIterator) bool { 900 if seg.ValuePtr().refs != 1 { 901 hasUniqueRef = false 902 return false 903 } 904 return true 905 }) 906 return hasUniqueRef 907 } 908 909 // IncRef implements memmap.File.IncRef. 910 func (f *MemoryFile) IncRef(fr memmap.FileRange, memCgID uint32) { 911 if !fr.WellFormed() || fr.Length() == 0 || fr.Start%hostarch.PageSize != 0 || fr.End%hostarch.PageSize != 0 { 912 panic(fmt.Sprintf("invalid range: %v", fr)) 913 } 914 915 f.mu.Lock() 916 defer f.mu.Unlock() 917 918 f.usage.MutateFullRange(fr, func(seg usageIterator) bool { 919 seg.ValuePtr().refs++ 920 return true 921 }) 922 } 923 924 // DecRef implements memmap.File.DecRef. 925 func (f *MemoryFile) DecRef(fr memmap.FileRange) { 926 if !fr.WellFormed() || fr.Length() == 0 || fr.Start%hostarch.PageSize != 0 || fr.End%hostarch.PageSize != 0 { 927 panic(fmt.Sprintf("invalid range: %v", fr)) 928 } 929 930 var freed bool 931 932 f.mu.Lock() 933 defer f.mu.Unlock() 934 935 f.usage.MutateFullRange(fr, func(seg usageIterator) bool { 936 val := seg.ValuePtr() 937 if val.refs == 0 { 938 panic(fmt.Sprintf("DecRef(%v): 0 existing references on %v:\n%v", fr, seg.Range(), &f.usage)) 939 } 940 val.refs-- 941 if val.refs == 0 { 942 f.reclaim.InsertRange(seg.Range(), reclaimSetValue{}) 943 freed = true 944 // Reclassify memory as System, until it's freed by the reclaim 945 // goroutine. 946 if val.knownCommitted { 947 usage.MemoryAccounting.Move(seg.Range().Length(), usage.System, val.kind, val.memCgID) 948 } 949 val.kind = usage.System 950 } 951 return true 952 }) 953 954 if freed { 955 f.reclaimable = true 956 f.reclaimCond.Signal() 957 } 958 } 959 960 // MapInternal implements memmap.File.MapInternal. 961 func (f *MemoryFile) MapInternal(fr memmap.FileRange, at hostarch.AccessType) (safemem.BlockSeq, error) { 962 if !fr.WellFormed() || fr.Length() == 0 { 963 panic(fmt.Sprintf("invalid range: %v", fr)) 964 } 965 if at.Execute { 966 return safemem.BlockSeq{}, linuxerr.EACCES 967 } 968 969 chunks := ((fr.End + chunkMask) >> chunkShift) - (fr.Start >> chunkShift) 970 if chunks == 1 { 971 // Avoid an unnecessary slice allocation. 972 var seq safemem.BlockSeq 973 err := f.forEachMappingSlice(fr, func(bs []byte) { 974 seq = safemem.BlockSeqOf(safemem.BlockFromSafeSlice(bs)) 975 }) 976 return seq, err 977 } 978 blocks := make([]safemem.Block, 0, chunks) 979 err := f.forEachMappingSlice(fr, func(bs []byte) { 980 blocks = append(blocks, safemem.BlockFromSafeSlice(bs)) 981 }) 982 return safemem.BlockSeqFromSlice(blocks), err 983 } 984 985 // forEachMappingSlice invokes fn on a sequence of byte slices that 986 // collectively map all bytes in fr. 
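//
// For example (the offsets are illustrative): with a 1 GiB chunkSize, the
// range [0x3ffff000, 0x40002000) straddles two chunks, so fn is called twice,
// once with the last page of chunk 0 and once with the first two pages of
// chunk 1.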
987 func (f *MemoryFile) forEachMappingSlice(fr memmap.FileRange, fn func([]byte)) error { 988 mappings := *f.mappings.Load() 989 for chunkStart := fr.Start &^ chunkMask; chunkStart < fr.End; chunkStart += chunkSize { 990 chunk := int(chunkStart >> chunkShift) 991 m := atomic.LoadUintptr(&mappings[chunk]) 992 if m == 0 { 993 var err error 994 mappings, m, err = f.getChunkMapping(chunk) 995 if err != nil { 996 return err 997 } 998 } 999 startOff := uint64(0) 1000 if chunkStart < fr.Start { 1001 startOff = fr.Start - chunkStart 1002 } 1003 endOff := uint64(chunkSize) 1004 if chunkStart+chunkSize > fr.End { 1005 endOff = fr.End - chunkStart 1006 } 1007 fn(unsafeSlice(m, chunkSize)[startOff:endOff]) 1008 } 1009 return nil 1010 } 1011 1012 func (f *MemoryFile) getChunkMapping(chunk int) ([]uintptr, uintptr, error) { 1013 f.mappingsMu.Lock() 1014 defer f.mappingsMu.Unlock() 1015 // Another thread may have replaced f.mappings altogether due to file 1016 // expansion. 1017 mappings := *f.mappings.Load() 1018 // Another thread may have already mapped the chunk. 1019 if m := mappings[chunk]; m != 0 { 1020 return mappings, m, nil 1021 } 1022 m, _, errno := unix.Syscall6( 1023 unix.SYS_MMAP, 1024 0, 1025 chunkSize, 1026 unix.PROT_READ|unix.PROT_WRITE, 1027 unix.MAP_SHARED, 1028 f.file.Fd(), 1029 uintptr(chunk<<chunkShift)) 1030 if errno != 0 { 1031 return nil, 0, errno 1032 } 1033 atomic.StoreUintptr(&mappings[chunk], m) 1034 return mappings, m, nil 1035 } 1036 1037 // MarkEvictable allows f to request memory deallocation by calling 1038 // user.Evict(er) in the future. 1039 // 1040 // Redundantly marking an already-evictable range as evictable has no effect. 1041 func (f *MemoryFile) MarkEvictable(user EvictableMemoryUser, er EvictableRange) { 1042 f.mu.Lock() 1043 defer f.mu.Unlock() 1044 info, ok := f.evictable[user] 1045 if !ok { 1046 info = &evictableMemoryUserInfo{} 1047 f.evictable[user] = info 1048 } 1049 gap := info.ranges.LowerBoundGap(er.Start) 1050 for gap.Ok() && gap.Start() < er.End { 1051 gapER := gap.Range().Intersect(er) 1052 if gapER.Length() == 0 { 1053 gap = gap.NextGap() 1054 continue 1055 } 1056 gap = info.ranges.Insert(gap, gapER, evictableRangeSetValue{}).NextGap() 1057 } 1058 if !info.evicting { 1059 switch f.opts.DelayedEviction { 1060 case DelayedEvictionDisabled: 1061 // Kick off eviction immediately. 1062 f.startEvictionGoroutineLocked(user, info) 1063 case DelayedEvictionEnabled: 1064 if !f.opts.UseHostMemcgPressure { 1065 // Ensure that the reclaimer goroutine is running, so that it 1066 // can start eviction when necessary. 1067 f.reclaimCond.Signal() 1068 } 1069 } 1070 } 1071 } 1072 1073 // MarkUnevictable informs f that user no longer considers er to be evictable, 1074 // so the MemoryFile should no longer call user.Evict(er). Note that, per 1075 // EvictableMemoryUser.Evict's documentation, user.Evict(er) may still be 1076 // called even after MarkUnevictable returns due to race conditions, and 1077 // implementations of EvictableMemoryUser must handle this possibility. 1078 // 1079 // Redundantly marking an already-unevictable range as unevictable has no 1080 // effect. 
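//
// Unmarking may split previously-marked ranges: for example (offsets are
// illustrative), if [0, 3) is currently evictable and MarkUnevictable is
// called with er = [1, 2), only [1, 2) is removed; [0, 1) and [2, 3) remain
// evictable.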
1081 func (f *MemoryFile) MarkUnevictable(user EvictableMemoryUser, er EvictableRange) { 1082 f.mu.Lock() 1083 defer f.mu.Unlock() 1084 info, ok := f.evictable[user] 1085 if !ok { 1086 return 1087 } 1088 seg := info.ranges.LowerBoundSegment(er.Start) 1089 for seg.Ok() && seg.Start() < er.End { 1090 seg = info.ranges.Isolate(seg, er) 1091 seg = info.ranges.Remove(seg).NextSegment() 1092 } 1093 // We can only remove info if there's no eviction goroutine running on its 1094 // behalf. 1095 if !info.evicting && info.ranges.IsEmpty() { 1096 delete(f.evictable, user) 1097 } 1098 } 1099 1100 // MarkAllUnevictable informs f that user no longer considers any offsets to be 1101 // evictable. It otherwise has the same semantics as MarkUnevictable. 1102 func (f *MemoryFile) MarkAllUnevictable(user EvictableMemoryUser) { 1103 f.mu.Lock() 1104 defer f.mu.Unlock() 1105 info, ok := f.evictable[user] 1106 if !ok { 1107 return 1108 } 1109 info.ranges.RemoveAll() 1110 // We can only remove info if there's no eviction goroutine running on its 1111 // behalf. 1112 if !info.evicting { 1113 delete(f.evictable, user) 1114 } 1115 } 1116 1117 // ShouldCacheEvictable returns true if f is meaningfully delaying evictions of 1118 // evictable memory, such that it may be advantageous to cache data in 1119 // evictable memory. The value returned by ShouldCacheEvictable may change 1120 // between calls. 1121 func (f *MemoryFile) ShouldCacheEvictable() bool { 1122 return f.opts.DelayedEviction == DelayedEvictionManual || f.opts.UseHostMemcgPressure 1123 } 1124 1125 // UpdateUsage ensures that the memory usage statistics in 1126 // usage.MemoryAccounting are up to date. If memCgIDs is nil, all the pages 1127 // will be scanned. Else only the pages which belong to the memory cgroup ids 1128 // in memCgIDs will be scanned and the memory usage will be updated. 1129 func (f *MemoryFile) UpdateUsage(memCgIDs map[uint32]struct{}) error { 1130 f.mu.Lock() 1131 defer f.mu.Unlock() 1132 1133 // If the underlying usage matches where the usage tree already 1134 // represents, then we can just avoid the entire scan (we know it's 1135 // accurate). 1136 currentUsage, err := f.TotalUsage() 1137 if err != nil { 1138 return err 1139 } 1140 if currentUsage == f.usageExpected && f.usageSwapped == 0 { 1141 log.Debugf("UpdateUsage: skipped with usageSwapped=0.") 1142 return nil 1143 } 1144 // If the current usage matches the expected but there's swap 1145 // accounting, then ensure a scan takes place at least every second 1146 // (when requested). 1147 if currentUsage == f.usageExpected+f.usageSwapped && time.Now().Before(f.usageLast.Add(time.Second)) { 1148 log.Debugf("UpdateUsage: skipped with usageSwapped!=0.") 1149 return nil 1150 } 1151 1152 // Linux updates usage values at CONFIG_HZ. 
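	// With linux.CLOCKS_PER_SEC at its current value of 100, the check below
	// skips scans requested within 10ms of the start of the previous one.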
1153 if scanningAfter := time.Now().Sub(f.usageLast).Milliseconds(); scanningAfter < time.Second.Milliseconds()/linux.CLOCKS_PER_SEC { 1154 log.Debugf("UpdateUsage: skipped because previous scan happened %d ms back", scanningAfter) 1155 return nil 1156 } 1157 1158 if memCgIDs == nil { 1159 f.usageLast = time.Now() 1160 } 1161 err = f.updateUsageLocked(currentUsage, memCgIDs, false /* alsoScanCommitted */, mincore) 1162 log.Debugf("UpdateUsage: currentUsage=%d, usageExpected=%d, usageSwapped=%d.", 1163 currentUsage, f.usageExpected, f.usageSwapped) 1164 log.Debugf("UpdateUsage: took %v.", time.Since(f.usageLast)) 1165 return err 1166 } 1167 1168 // updateUsageLocked attempts to detect commitment of previously-uncommitted 1169 // pages by invoking checkCommitted, and updates memory accounting to reflect 1170 // newly-committed pages. If alsoScanCommitted is true, updateUsageLocked also 1171 // attempts to detect decommitment of previously-committed pages; this is only 1172 // used by save/restore, which optionally temporarily treats zeroed pages as 1173 // decommitted in order to skip saving them. 1174 // 1175 // For each page i in bs, checkCommitted must set committed[i] to 1 if the page 1176 // is committed and 0 otherwise. off is the offset at which bs begins. 1177 // wasCommitted is true if the page was known-committed before the call to 1178 // checkCommitted and false otherwise; wasCommitted can only be true if 1179 // alsoScanCommitted is true. 1180 // 1181 // Precondition: f.mu must be held; it may be unlocked and reacquired. 1182 // +checklocks:f.mu 1183 func (f *MemoryFile) updateUsageLocked(currentUsage uint64, memCgIDs map[uint32]struct{}, alsoScanCommitted bool, checkCommitted func(bs []byte, committed []byte, off uint64, wasCommitted bool) error) error { 1184 // Track if anything changed to elide the merge. In the common case, we 1185 // expect all segments to be committed and no merge to occur. 1186 changedAny := false 1187 defer func() { 1188 if changedAny { 1189 f.usage.MergeAll() 1190 } 1191 1192 // Adjust the swap usage to reflect reality. 1193 if f.usageExpected < currentUsage { 1194 // Since no pages may be marked decommitted while we hold mu, we 1195 // know that usage may have only increased since we got the last 1196 // current usage. Therefore, if usageExpected is still short of 1197 // currentUsage, we must assume that the difference is in pages 1198 // that have been swapped. 1199 newUsageSwapped := currentUsage - f.usageExpected 1200 if f.usageSwapped < newUsageSwapped { 1201 usage.MemoryAccounting.Inc(newUsageSwapped-f.usageSwapped, usage.System, 0) 1202 } else { 1203 usage.MemoryAccounting.Dec(f.usageSwapped-newUsageSwapped, usage.System, 0) 1204 } 1205 f.usageSwapped = newUsageSwapped 1206 } else if f.usageSwapped != 0 { 1207 // We have more usage accounted for than the file itself. 1208 // That's fine, we probably caught a race where pages were 1209 // being committed while the below loop was running. Just 1210 // report the higher number that we found and ignore swap. 1211 usage.MemoryAccounting.Dec(f.usageSwapped, usage.System, 0) 1212 f.usageSwapped = 0 1213 } 1214 }() 1215 1216 // Reused mincore buffer, will generally be <= 4096 bytes. 1217 var buf []byte 1218 1219 // Iterate over all usage data. There will only be usage segments 1220 // present when there is an associated reference. 
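	// In the UpdateUsage path, checkCommitted is the mincore-based scanner:
	// it sets bit 0 of each byte of committed iff the corresponding page is
	// resident, which is why the loop below tests buf[i]&0x1.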
1221 for seg := f.usage.FirstSegment(); seg.Ok(); { 1222 if seg.ValuePtr().refs == 0 { 1223 // We assume that reclaimable pages (that aren't already known to 1224 // be committed) are not committed. This isn't necessarily true, 1225 // even after the reclaimer does Decommit(), because the kernel may 1226 // subsequently back the hugepage-sized region containing the 1227 // decommitted page with a hugepage. However, it's consistent with 1228 // our treatment of unallocated pages, which have the same 1229 // property. 1230 seg = seg.NextSegment() 1231 continue 1232 } 1233 wasCommitted := seg.ValuePtr().knownCommitted 1234 if !alsoScanCommitted && wasCommitted { 1235 seg = seg.NextSegment() 1236 continue 1237 } 1238 1239 // Scan the pages of the given memCgID only. This will avoid scanning the 1240 // whole memory file when the memory usage is required only for a specific 1241 // cgroup. The total memory usage of all cgroups can be obtained when the 1242 // memCgIDs is nil. 1243 if memCgIDs != nil { 1244 if _, ok := memCgIDs[seg.ValuePtr().memCgID]; !ok { 1245 seg = seg.NextSegment() 1246 continue 1247 } 1248 } 1249 1250 // Get the range for this segment. As we touch slices, the 1251 // Start value will be walked along. 1252 r := seg.Range() 1253 1254 var checkErr error 1255 err := f.forEachMappingSlice(r, 1256 func(s []byte) { 1257 if checkErr != nil { 1258 return 1259 } 1260 1261 // Ensure that we have sufficient buffer for the call 1262 // (one byte per page). The length of each slice must 1263 // be page-aligned. 1264 bufLen := len(s) / hostarch.PageSize 1265 if len(buf) < bufLen { 1266 buf = make([]byte, bufLen) 1267 } 1268 1269 // Query for new pages in core. 1270 // NOTE(b/165896008): mincore (which is passed as checkCommitted) 1271 // by f.UpdateUsage() might take a really long time. So unlock f.mu 1272 // while checkCommitted runs. 1273 f.mu.Unlock() // +checklocksforce 1274 err := checkCommitted(s, buf, r.Start, wasCommitted) 1275 f.mu.Lock() 1276 if err != nil { 1277 checkErr = err 1278 return 1279 } 1280 1281 // Scan each page and switch out segments. If wasCommitted is 1282 // false, then we are marking ranges that are now committed; 1283 // otherwise, we are marking ranges that are now uncommitted. 1284 unchangedVal := byte(0) 1285 if wasCommitted { 1286 unchangedVal = 1 1287 } 1288 seg := f.usage.LowerBoundSegment(r.Start) 1289 for i := 0; i < bufLen; { 1290 if buf[i]&0x1 == unchangedVal { 1291 i++ 1292 continue 1293 } 1294 // Scan to the end of this changed range. 1295 j := i + 1 1296 for ; j < bufLen; j++ { 1297 if buf[j]&0x1 == unchangedVal { 1298 break 1299 } 1300 } 1301 changedFR := memmap.FileRange{ 1302 Start: r.Start + uint64(i*hostarch.PageSize), 1303 End: r.Start + uint64(j*hostarch.PageSize), 1304 } 1305 // Advance seg to changedFR.Start. 1306 for seg.Ok() && seg.End() <= changedFR.Start { 1307 seg = seg.NextSegment() 1308 } 1309 // Mark pages overlapping changedFR as committed or 1310 // decommitted. 
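				// (f.mu was dropped while checkCommitted ran, so segments
				// overlapping changedFR may have been decommitted, freed,
				// or re-scanned in the meantime; only flip segments that
				// are still referenced and still in the pre-scan state.)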
1311 for seg.Ok() && seg.Start() < changedFR.End { 1312 if seg.ValuePtr().refs != 0 && seg.ValuePtr().knownCommitted == wasCommitted { 1313 seg = f.usage.Isolate(seg, changedFR) 1314 seg.ValuePtr().knownCommitted = !wasCommitted 1315 amount := seg.Range().Length() 1316 if wasCommitted { 1317 usage.MemoryAccounting.Dec(amount, seg.ValuePtr().kind, seg.ValuePtr().memCgID) 1318 f.usageExpected -= amount 1319 } else { 1320 usage.MemoryAccounting.Inc(amount, seg.ValuePtr().kind, seg.ValuePtr().memCgID) 1321 f.usageExpected += amount 1322 } 1323 changedAny = true 1324 } 1325 seg = seg.NextSegment() 1326 } 1327 // Continue scanning for changed pages. 1328 i = j + 1 1329 } 1330 1331 // Advance r.Start. 1332 r.Start += uint64(len(s)) 1333 }) 1334 if checkErr != nil { 1335 return checkErr 1336 } 1337 if err != nil { 1338 return err 1339 } 1340 1341 // Continue with the first segment after r.End. 1342 seg = f.usage.LowerBoundSegment(r.End) 1343 } 1344 1345 return nil 1346 } 1347 1348 // TotalUsage returns an aggregate usage for all memory statistics except 1349 // Mapped (which is external to MemoryFile). This is generally much cheaper 1350 // than UpdateUsage, but will not provide a fine-grained breakdown. 1351 func (f *MemoryFile) TotalUsage() (uint64, error) { 1352 // Stat the underlying file to discover the underlying usage. stat(2) 1353 // always reports the allocated block count in units of 512 bytes. This 1354 // includes pages in the page cache and swapped pages. 1355 var stat unix.Stat_t 1356 if err := unix.Fstat(int(f.file.Fd()), &stat); err != nil { 1357 return 0, err 1358 } 1359 return uint64(stat.Blocks * 512), nil 1360 } 1361 1362 // TotalSize returns the current size of the backing file in bytes, which is an 1363 // upper bound on the amount of memory that can currently be allocated from the 1364 // MemoryFile. The value returned by TotalSize is permitted to change. 1365 func (f *MemoryFile) TotalSize() uint64 { 1366 f.mu.Lock() 1367 defer f.mu.Unlock() 1368 return uint64(f.fileSize) 1369 } 1370 1371 // File returns the backing file. 1372 func (f *MemoryFile) File() *os.File { 1373 return f.file 1374 } 1375 1376 // FD implements memmap.File.FD. 1377 func (f *MemoryFile) FD() int { 1378 return int(f.file.Fd()) 1379 } 1380 1381 // IsDiskBacked returns true if f is backed by a file on disk. 1382 func (f *MemoryFile) IsDiskBacked() bool { 1383 return f.opts.DiskBackedFile 1384 } 1385 1386 // String implements fmt.Stringer.String. 1387 // 1388 // Note that because f.String locks f.mu, calling f.String internally 1389 // (including indirectly through the fmt package) risks recursive locking. 1390 // Within the pgalloc package, use f.usage directly instead. 1391 func (f *MemoryFile) String() string { 1392 f.mu.Lock() 1393 defer f.mu.Unlock() 1394 return f.usage.String() 1395 } 1396 1397 // runReclaim implements the reclaimer goroutine, which continuously decommits 1398 // reclaimable pages in order to reduce memory usage and make them available 1399 // for allocation. 1400 func (f *MemoryFile) runReclaim() { 1401 for { 1402 // N.B. We must call f.markReclaimed on the returned FrameRange. 1403 fr, ok := f.findReclaimable() 1404 if !ok { 1405 break 1406 } 1407 1408 if f.opts.ManualZeroing { 1409 // If ManualZeroing is in effect, only hugepage-aligned regions may 1410 // be safely passed to decommitFile. Pages will be zeroed on 1411 // reallocation, so we don't need to perform any manual zeroing 1412 // here, whether or not decommitFile succeeds. 
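			// The unaligned head and tail of fr are not decommitted here;
			// allocate will manually zero them if and when they are handed
			// out again.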
1413 if startAddr, ok := hostarch.Addr(fr.Start).HugeRoundUp(); ok { 1414 if endAddr := hostarch.Addr(fr.End).HugeRoundDown(); startAddr < endAddr { 1415 decommitFR := memmap.FileRange{uint64(startAddr), uint64(endAddr)} 1416 if err := f.decommitFile(decommitFR); err != nil { 1417 log.Warningf("Reclaim failed to decommit %v: %v", decommitFR, err) 1418 } 1419 } 1420 } 1421 } else { 1422 if err := f.decommitFile(fr); err != nil { 1423 log.Warningf("Reclaim failed to decommit %v: %v", fr, err) 1424 // Zero the pages manually. This won't reduce memory usage, but at 1425 // least ensures that the pages will be zero when reallocated. 1426 if err := f.manuallyZero(fr); err != nil { 1427 panic(fmt.Sprintf("Reclaim failed to decommit or zero %v: %v", fr, err)) 1428 } 1429 } 1430 } 1431 f.markDecommitted(fr) 1432 f.markReclaimed(fr) 1433 } 1434 1435 // We only get here if findReclaimable finds f.destroyed set and returns 1436 // false. 1437 f.mu.Lock() 1438 if !f.destroyed { 1439 f.mu.Unlock() 1440 panic("findReclaimable broke out of reclaim loop, but destroyed is no longer set") 1441 } 1442 if f.opts.DecommitOnDestroy && f.fileSize > 0 { 1443 if err := f.decommitFile(memmap.FileRange{Start: 0, End: uint64(f.fileSize)}); err != nil { 1444 f.mu.Unlock() 1445 panic(fmt.Sprintf("failed to decommit entire memory file during destruction: %v", err)) 1446 } 1447 } 1448 f.file.Close() 1449 // Ensure that any attempts to use f.file.Fd() fail instead of getting a fd 1450 // that has possibly been reassigned. 1451 f.file = nil 1452 f.mappingsMu.Lock() 1453 defer f.mappingsMu.Unlock() 1454 mappings := *f.mappings.Load() 1455 for i, m := range mappings { 1456 if m != 0 { 1457 _, _, errno := unix.Syscall(unix.SYS_MUNMAP, m, chunkSize, 0) 1458 if errno != 0 { 1459 log.Warningf("Failed to unmap mapping %#x for MemoryFile chunk %d: %v", m, i, errno) 1460 } 1461 } 1462 } 1463 // Similarly, invalidate f.mappings 1464 f.mappings.Store(nil) 1465 f.mu.Unlock() 1466 1467 // This must be called without holding f.mu to avoid circular lock 1468 // ordering. 1469 if f.stopNotifyPressure != nil { 1470 f.stopNotifyPressure() 1471 } 1472 } 1473 1474 // findReclaimable finds memory that has been marked for reclaim. 1475 // 1476 // Note that there returned range will be removed from tracking. It 1477 // must be reclaimed (removed from f.usage) at this point. 1478 func (f *MemoryFile) findReclaimable() (memmap.FileRange, bool) { 1479 f.mu.Lock() 1480 defer f.mu.Unlock() 1481 for { 1482 for { 1483 if f.destroyed { 1484 return memmap.FileRange{}, false 1485 } 1486 if f.reclaimable { 1487 break 1488 } 1489 if f.opts.DelayedEviction == DelayedEvictionEnabled && !f.opts.UseHostMemcgPressure { 1490 // No work to do. Evict any pending evictable allocations to 1491 // get more reclaimable pages before going to sleep. 1492 f.startEvictionsLocked() 1493 } 1494 f.reclaimCond.Wait() 1495 } 1496 // Most allocations are done upwards, with exceptions being stacks and some 1497 // allocators that allocate top-down. Reclaim preserves this order to 1498 // minimize the cost of the search. 1499 if seg := f.reclaim.FirstSegment(); seg.Ok() { 1500 fr := seg.Range() 1501 f.reclaim.Remove(seg) 1502 return fr, true 1503 } 1504 // Nothing is reclaimable. 1505 f.reclaimable = false 1506 } 1507 } 1508 1509 func (f *MemoryFile) markReclaimed(fr memmap.FileRange) { 1510 f.mu.Lock() 1511 defer f.mu.Unlock() 1512 seg := f.usage.FindSegment(fr.Start) 1513 // All of fr should be mapped to a single uncommitted reclaimable 1514 // segment accounted to System. 
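	// (This relies on f.usage being kept maximally merged, as documented on
	// the usage field: DecRef reset kind to usage.System when the last
	// reference was dropped and markDecommitted cleared knownCommitted, so
	// the reclaimed range coalesces into a single segment.)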
1515 if !seg.Ok() { 1516 panic(fmt.Sprintf("reclaimed pages %v include unreferenced pages:\n%v", fr, &f.usage)) 1517 } 1518 if !seg.Range().IsSupersetOf(fr) { 1519 panic(fmt.Sprintf("reclaimed pages %v are not entirely contained in segment %v with state %v:\n%v", fr, seg.Range(), seg.Value(), &f.usage)) 1520 } 1521 if got, want := seg.Value(), (usageInfo{ 1522 kind: usage.System, 1523 knownCommitted: false, 1524 refs: 0, 1525 memCgID: 0, 1526 }); got != want { 1527 panic(fmt.Sprintf("reclaimed pages %v in segment %v has incorrect state %v, wanted %v:\n%v", fr, seg.Range(), got, want, &f.usage)) 1528 } 1529 // Deallocate reclaimed pages. Even though all of seg is reclaimable, 1530 // the caller of markReclaimed may not have decommitted it, so we can 1531 // only mark fr as reclaimed. 1532 f.usage.Remove(f.usage.Isolate(seg, fr)) 1533 } 1534 1535 // StartEvictions requests that f evict all evictable allocations. It does not 1536 // wait for eviction to complete; for this, see MemoryFile.WaitForEvictions. 1537 func (f *MemoryFile) StartEvictions() { 1538 f.mu.Lock() 1539 defer f.mu.Unlock() 1540 f.startEvictionsLocked() 1541 } 1542 1543 // Preconditions: f.mu must be locked. 1544 func (f *MemoryFile) startEvictionsLocked() bool { 1545 startedAny := false 1546 for user, info := range f.evictable { 1547 // Don't start multiple goroutines to evict the same user's 1548 // allocations. 1549 if !info.evicting { 1550 f.startEvictionGoroutineLocked(user, info) 1551 startedAny = true 1552 } 1553 } 1554 return startedAny 1555 } 1556 1557 // Preconditions: 1558 // - info == f.evictable[user]. 1559 // - !info.evicting. 1560 // - f.mu must be locked. 1561 func (f *MemoryFile) startEvictionGoroutineLocked(user EvictableMemoryUser, info *evictableMemoryUserInfo) { 1562 info.evicting = true 1563 f.evictionWG.Add(1) 1564 go func() { // S/R-SAFE: f.evictionWG 1565 defer f.evictionWG.Done() 1566 for { 1567 f.mu.Lock() 1568 info, ok := f.evictable[user] 1569 if !ok { 1570 // This shouldn't happen: only this goroutine is permitted 1571 // to delete this entry. 1572 f.mu.Unlock() 1573 panic(fmt.Sprintf("evictableMemoryUserInfo for EvictableMemoryUser %v deleted while eviction goroutine running", user)) 1574 } 1575 if info.ranges.IsEmpty() { 1576 delete(f.evictable, user) 1577 f.mu.Unlock() 1578 return 1579 } 1580 // Evict from the end of info.ranges, under the assumption that 1581 // if ranges in user start being used again (and are 1582 // consequently marked unevictable), such uses are more likely 1583 // to start from the beginning of user. 1584 seg := info.ranges.LastSegment() 1585 er := seg.Range() 1586 info.ranges.Remove(seg) 1587 // user.Evict() must be called without holding f.mu to avoid 1588 // circular lock ordering. 1589 f.mu.Unlock() 1590 user.Evict(context.Background(), er) 1591 } 1592 }() 1593 } 1594 1595 // WaitForEvictions blocks until f is no longer evicting any evictable 1596 // allocations. 
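//
// A typical forced-eviction sequence (a sketch of one possible calling
// pattern, not a requirement) is:
//
//	f.StartEvictions()
//	f.WaitForEvictions()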
1597 func (f *MemoryFile) WaitForEvictions() { 1598 f.evictionWG.Wait() 1599 } 1600 1601 type usageSetFunctions struct{} 1602 1603 func (usageSetFunctions) MinKey() uint64 { 1604 return 0 1605 } 1606 1607 func (usageSetFunctions) MaxKey() uint64 { 1608 return math.MaxUint64 1609 } 1610 1611 func (usageSetFunctions) ClearValue(val *usageInfo) { 1612 } 1613 1614 func (usageSetFunctions) Merge(_ memmap.FileRange, val1 usageInfo, _ memmap.FileRange, val2 usageInfo) (usageInfo, bool) { 1615 return val1, val1 == val2 1616 } 1617 1618 func (usageSetFunctions) Split(_ memmap.FileRange, val usageInfo, _ uint64) (usageInfo, usageInfo) { 1619 return val, val 1620 } 1621 1622 // evictableRangeSetValue is the value type of evictableRangeSet. 1623 type evictableRangeSetValue struct{} 1624 1625 type evictableRangeSetFunctions struct{} 1626 1627 func (evictableRangeSetFunctions) MinKey() uint64 { 1628 return 0 1629 } 1630 1631 func (evictableRangeSetFunctions) MaxKey() uint64 { 1632 return math.MaxUint64 1633 } 1634 1635 func (evictableRangeSetFunctions) ClearValue(val *evictableRangeSetValue) { 1636 } 1637 1638 func (evictableRangeSetFunctions) Merge(_ EvictableRange, _ evictableRangeSetValue, _ EvictableRange, _ evictableRangeSetValue) (evictableRangeSetValue, bool) { 1639 return evictableRangeSetValue{}, true 1640 } 1641 1642 func (evictableRangeSetFunctions) Split(_ EvictableRange, _ evictableRangeSetValue, _ uint64) (evictableRangeSetValue, evictableRangeSetValue) { 1643 return evictableRangeSetValue{}, evictableRangeSetValue{} 1644 } 1645 1646 // reclaimSetValue is the value type of reclaimSet. 1647 type reclaimSetValue struct{} 1648 1649 type reclaimSetFunctions struct{} 1650 1651 func (reclaimSetFunctions) MinKey() uint64 { 1652 return 0 1653 } 1654 1655 func (reclaimSetFunctions) MaxKey() uint64 { 1656 return math.MaxUint64 1657 } 1658 1659 func (reclaimSetFunctions) ClearValue(val *reclaimSetValue) { 1660 } 1661 1662 func (reclaimSetFunctions) Merge(_ memmap.FileRange, _ reclaimSetValue, _ memmap.FileRange, _ reclaimSetValue) (reclaimSetValue, bool) { 1663 return reclaimSetValue{}, true 1664 } 1665 1666 func (reclaimSetFunctions) Split(_ memmap.FileRange, _ reclaimSetValue, _ uint64) (reclaimSetValue, reclaimSetValue) { 1667 return reclaimSetValue{}, reclaimSetValue{} 1668 }
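
// exampleAllocateAndFill is an illustrative sketch, not part of the original
// file or API: it shows one plausible allocate-and-fill sequence against a
// MemoryFile. The accounting kind (usage.Anonymous) and the assumption that
// the caller is not attached to a memory cgroup (MemCgID == 0) are
// hypothetical choices for the example; len(data) is assumed to be non-zero.
func exampleAllocateAndFill(f *MemoryFile, data []byte) (memmap.FileRange, error) {
	// Allocation lengths must be page-aligned and non-zero.
	length := uint64(hostarch.Addr(len(data)).MustRoundUp())
	fr, err := f.Allocate(length, AllocOpts{
		Kind: usage.Anonymous,
		Dir:  BottomUp,
		// We fill the memory by writing to it from this process, so ask for
		// write-population rather than AllocateOnly.
		Mode: AllocateAndWritePopulate,
	})
	if err != nil {
		return memmap.FileRange{}, err
	}
	ims, err := f.MapInternal(fr, hostarch.Write)
	if err != nil {
		f.DecRef(fr)
		return memmap.FileRange{}, err
	}
	// Copy data into the allocated pages; the remainder of the last page is
	// left zeroed.
	if _, err := safemem.CopySeq(ims, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(data))); err != nil {
		f.DecRef(fr)
		return memmap.FileRange{}, err
	}
	// The caller now holds a single reference on fr and must eventually
	// release it with f.DecRef(fr).
	return fr, nil
}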