github.com/MerlinKodo/gvisor@v0.0.0-20231110090155-957f62ecf90e/pkg/sentry/pgalloc/pgalloc.go 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // Package pgalloc contains the page allocator subsystem, which manages memory 16 // that may be mapped into application address spaces. 17 // 18 // Lock order: 19 // 20 // pgalloc.MemoryFile.mu 21 // pgalloc.MemoryFile.mappingsMu 22 package pgalloc 23 24 import ( 25 "fmt" 26 "math" 27 "os" 28 "sync/atomic" 29 "time" 30 31 "github.com/MerlinKodo/gvisor/pkg/abi/linux" 32 "github.com/MerlinKodo/gvisor/pkg/atomicbitops" 33 "github.com/MerlinKodo/gvisor/pkg/context" 34 "github.com/MerlinKodo/gvisor/pkg/errors/linuxerr" 35 "github.com/MerlinKodo/gvisor/pkg/hostarch" 36 "github.com/MerlinKodo/gvisor/pkg/log" 37 "github.com/MerlinKodo/gvisor/pkg/safemem" 38 "github.com/MerlinKodo/gvisor/pkg/sentry/hostmm" 39 "github.com/MerlinKodo/gvisor/pkg/sentry/memmap" 40 "github.com/MerlinKodo/gvisor/pkg/sentry/usage" 41 "github.com/MerlinKodo/gvisor/pkg/sync" 42 "golang.org/x/sys/unix" 43 ) 44 45 // Direction describes how to allocate offsets from MemoryFile. 46 type Direction int 47 48 const ( 49 // BottomUp allocates offsets in increasing order. 50 BottomUp Direction = iota 51 // TopDown allocates offsets in decreasing order. 52 TopDown 53 ) 54 55 // String implements fmt.Stringer. 56 func (d Direction) String() string { 57 switch d { 58 case BottomUp: 59 return "up" 60 case TopDown: 61 return "down" 62 } 63 panic(fmt.Sprintf("invalid direction: %d", d)) 64 } 65 66 // MemoryFile is a memmap.File whose pages may be allocated to arbitrary 67 // users. 68 type MemoryFile struct { 69 // opts holds options passed to NewMemoryFile. opts is immutable. 70 opts MemoryFileOpts 71 72 // MemoryFile owns a single backing file, which is modeled as follows: 73 // 74 // Each page in the file can be committed or uncommitted. A page is 75 // committed if the host kernel is spending resources to store its contents 76 // and uncommitted otherwise. This definition includes pages that the host 77 // kernel has swapped; this is intentional, to ensure that accounting does 78 // not change even if host kernel swapping behavior changes, and that 79 // memory used by pseudo-swap mechanisms like zswap is still accounted. 80 // 81 // The initial contents of uncommitted pages are implicitly zero bytes. A 82 // read or write to the contents of an uncommitted page causes it to be 83 // committed. This is the only event that can cause an uncommitted page to 84 // be committed. 85 // 86 // fallocate(FALLOC_FL_PUNCH_HOLE) (MemoryFile.Decommit) causes committed 87 // pages to be uncommitted. This is the only event that can cause a 88 // committed page to be uncommitted. 89 // 90 // Memory accounting is based on identifying the set of committed pages.
91 // Since we do not have direct access to the MMU, tracking reads and writes 92 // to uncommitted pages to detect commitment would introduce additional 93 // page faults, which would be prohibitively expensive. Instead, we query 94 // the host kernel to determine which pages are committed. 95 96 // file is the backing file. The file pointer is immutable. 97 file *os.File 98 99 mu memoryFileMutex 100 101 // usage maps each page in the file to metadata for that page. Pages for 102 // which no segment exists in usage are both unallocated (not in use) and 103 // uncommitted. 104 // 105 // Since usage stores usageInfo objects by value, clients should usually 106 // use usageIterator.ValuePtr() instead of usageIterator.Value() to get a 107 // pointer to the usageInfo rather than a copy. 108 // 109 // usage must be kept maximally merged (that is, there should never be two 110 // adjacent segments with the same values). At least markReclaimed depends 111 // on this property. 112 // 113 // usage is protected by mu. 114 usage usageSet 115 116 // The UpdateUsage function scans all segments with knownCommitted set 117 // to false, sees which pages are committed and creates corresponding 118 // segments with knownCommitted set to true. 119 // 120 // In order to avoid unnecessary scans, usageExpected tracks the total 121 // file blocks expected. This is used to elide the scan when this 122 // matches the underlying file blocks. 123 // 124 // To track swapped pages, usageSwapped tracks the discrepancy between 125 // what is observed in core and what is reported by the file. When 126 // usageSwapped is non-zero, a sweep will be performed at least every 127 // second. The start of the last sweep is recorded in usageLast. 128 // 129 // All usage attributes are protected by mu. 130 usageExpected uint64 131 usageSwapped uint64 132 usageLast time.Time 133 134 // fileSize is the size of the backing memory file in bytes. fileSize is 135 // always a power-of-two multiple of chunkSize. 136 // 137 // fileSize is protected by mu. 138 fileSize int64 139 140 // Pages from the backing file are mapped into the local address space on 141 // the granularity of large pieces called chunks. mappings is a []uintptr 142 // that stores, for each chunk, the start address of a mapping of that 143 // chunk in the current process' address space, or 0 if no such mapping 144 // exists. Once a chunk is mapped, it is never remapped or unmapped until 145 // the MemoryFile is destroyed. 146 // 147 // Mutating the mappings slice or its contents requires both holding 148 // mappingsMu and using atomic memory operations. (The slice is mutated 149 // whenever the file is expanded. Per the above, the only permitted 150 // mutation of the slice's contents is the assignment of a mapping to a 151 // chunk that was previously unmapped.) Reading the slice or its contents 152 // only requires *either* holding mappingsMu or using atomic memory 153 // operations. This allows MemoryFile.MapInternal to avoid locking in the 154 // common case where chunk mappings already exist. 155 mappingsMu mappingsMutex 156 mappings atomic.Value 157 158 // destroyed is set by Destroy to instruct the reclaimer goroutine to 159 // release resources and exit. destroyed is protected by mu. 160 destroyed bool 161 162 // reclaimable is true if usage may contain reclaimable pages. reclaimable 163 // is protected by mu. 164 reclaimable bool 165 166 // reclaim is the collection of regions for reclaim. reclaim is protected 167 // by mu.
168 reclaim reclaimSet 169 170 // reclaimCond is signaled (with mu locked) when reclaimable or destroyed 171 // transitions from false to true. 172 reclaimCond sync.Cond 173 174 // evictable maps EvictableMemoryUsers to eviction state. 175 // 176 // evictable is protected by mu. 177 evictable map[EvictableMemoryUser]*evictableMemoryUserInfo 178 179 // evictionWG counts the number of goroutines currently performing evictions. 180 evictionWG sync.WaitGroup 181 182 // stopNotifyPressure stops memory cgroup pressure level 183 // notifications used to drive eviction. stopNotifyPressure is 184 // immutable. 185 stopNotifyPressure func() 186 } 187 188 // MemoryFileOpts provides options to NewMemoryFile. 189 type MemoryFileOpts struct { 190 // DelayedEviction controls the extent to which the MemoryFile may delay 191 // eviction of evictable allocations. 192 DelayedEviction DelayedEvictionType 193 194 // If UseHostMemcgPressure is true, use host memory cgroup pressure level 195 // notifications to determine when eviction is necessary. This option has 196 // no effect unless DelayedEviction is DelayedEvictionEnabled. 197 UseHostMemcgPressure bool 198 199 // DecommitOnDestroy indicates whether the entire host file should be 200 // decommitted on destruction. This is appropriate for host filesystem based 201 // files that need to be explicitly cleaned up to release disk space. 202 DecommitOnDestroy bool 203 204 // If ManualZeroing is true, MemoryFile must not assume that new pages 205 // obtained from the host are zero-filled, such that MemoryFile must manually 206 // zero newly-allocated pages. 207 ManualZeroing bool 208 209 // If DisableIMAWorkAround is true, NewMemoryFile will not call 210 // IMAWorkAroundForMemFile(). 211 DisableIMAWorkAround bool 212 213 // DiskBackedFile indicates that the MemoryFile is backed by a file on disk. 214 DiskBackedFile bool 215 } 216 217 // DelayedEvictionType is the type of MemoryFileOpts.DelayedEviction. 218 type DelayedEvictionType int 219 220 const ( 221 // DelayedEvictionDefault has unspecified behavior. 222 DelayedEvictionDefault DelayedEvictionType = iota 223 224 // DelayedEvictionDisabled requires that evictable allocations are evicted 225 // as soon as possible. 226 DelayedEvictionDisabled 227 228 // DelayedEvictionEnabled requests that the MemoryFile delay eviction of 229 // evictable allocations until doing so is considered necessary to avoid 230 // performance degradation due to host memory pressure, or OOM kills. 231 // 232 // As of this writing, the behavior of DelayedEvictionEnabled depends on 233 // whether or not MemoryFileOpts.UseHostMemcgPressure is enabled: 234 // 235 // - If UseHostMemcgPressure is true, evictions are delayed until memory 236 // pressure is indicated. 237 // 238 // - Otherwise, evictions are only delayed until the reclaimer goroutine 239 // is out of work (pages to reclaim). 240 DelayedEvictionEnabled 241 242 // DelayedEvictionManual requires that evictable allocations are only 243 // evicted when MemoryFile.StartEvictions() is called. This is extremely 244 // dangerous outside of tests. 245 DelayedEvictionManual 246 ) 247 248 // usageInfo tracks usage information. 249 // 250 // +stateify savable 251 type usageInfo struct { 252 // kind is the usage kind. 253 kind usage.MemoryKind 254 255 // knownCommitted is true if the tracked region is definitely committed. 256 // (If it is false, the tracked region may or may not be committed.) 
257 knownCommitted bool 258 259 refs uint64 260 261 // memCgID is the memory cgroup id to which this page is committed. 262 memCgID uint32 263 } 264 265 // canCommit returns true if the tracked region can be committed. 266 func (u *usageInfo) canCommit() bool { 267 // refs must be greater than 0 because we assume that reclaimable pages 268 // (that aren't already known to be committed) are not committed. This 269 // isn't necessarily true, even after the reclaimer does Decommit(), 270 // because the kernel may subsequently back the hugepage-sized region 271 // containing the decommitted page with a hugepage. However, it's 272 // consistent with our treatment of unallocated pages, which have the same 273 // property. 274 return !u.knownCommitted && u.refs != 0 275 } 276 277 // An EvictableMemoryUser represents a user of MemoryFile-allocated memory that 278 // may be asked to deallocate that memory in the presence of memory pressure. 279 type EvictableMemoryUser interface { 280 // Evict requests that the EvictableMemoryUser deallocate memory used by 281 // er, which was registered as evictable by a previous call to 282 // MemoryFile.MarkEvictable. 283 // 284 // Evict is not required to deallocate memory. In particular, since pgalloc 285 // must call Evict without holding locks to avoid circular lock ordering, 286 // it is possible that the passed range has already been marked as 287 // unevictable by a racing call to MemoryFile.MarkUnevictable. 288 // Implementations of EvictableMemoryUser must detect such races and handle 289 // them by making Evict have no effect on unevictable ranges. 290 // 291 // After a call to Evict, the MemoryFile will consider the evicted range 292 // unevictable (i.e. it will not call Evict on the same range again) until 293 // informed otherwise by a subsequent call to MarkEvictable. 294 Evict(ctx context.Context, er EvictableRange) 295 } 296 297 // An EvictableRange represents a range of uint64 offsets in an 298 // EvictableMemoryUser. 299 // 300 // In practice, most EvictableMemoryUsers will probably be implementations of 301 // memmap.Mappable, and EvictableRange therefore corresponds to 302 // memmap.MappableRange. However, this package cannot depend on the memmap 303 // package, since doing so would create a circular dependency. 304 // 305 // type EvictableRange <generated using go_generics> 306 307 // evictableMemoryUserInfo is the value type of MemoryFile.evictable. 308 type evictableMemoryUserInfo struct { 309 // ranges tracks all evictable ranges for the given user. 310 ranges evictableRangeSet 311 312 // If evicting is true, there is a goroutine currently evicting all 313 // evictable ranges for this user. 314 evicting bool 315 } 316 317 const ( 318 chunkShift = 30 319 chunkSize = 1 << chunkShift // 1 GB 320 chunkMask = chunkSize - 1 321 322 // maxPage is the highest 64-bit page. 323 maxPage = math.MaxUint64 &^ (hostarch.PageSize - 1) 324 ) 325 326 // NewMemoryFile creates a MemoryFile backed by the given file. If 327 // NewMemoryFile succeeds, ownership of file is transferred to the returned 328 // MemoryFile. 
329 func NewMemoryFile(file *os.File, opts MemoryFileOpts) (*MemoryFile, error) { 330 switch opts.DelayedEviction { 331 case DelayedEvictionDefault: 332 opts.DelayedEviction = DelayedEvictionEnabled 333 case DelayedEvictionDisabled, DelayedEvictionManual: 334 opts.UseHostMemcgPressure = false 335 case DelayedEvictionEnabled: 336 // ok 337 default: 338 return nil, fmt.Errorf("invalid MemoryFileOpts.DelayedEviction: %v", opts.DelayedEviction) 339 } 340 341 // Truncate the file to 0 bytes first to ensure that it's empty. 342 if err := file.Truncate(0); err != nil { 343 return nil, err 344 } 345 f := &MemoryFile{ 346 opts: opts, 347 file: file, 348 evictable: make(map[EvictableMemoryUser]*evictableMemoryUserInfo), 349 } 350 f.mappings.Store(make([]uintptr, 0)) 351 f.reclaimCond.L = &f.mu 352 353 if f.opts.DelayedEviction == DelayedEvictionEnabled && f.opts.UseHostMemcgPressure { 354 stop, err := hostmm.NotifyCurrentMemcgPressureCallback(func() { 355 f.mu.Lock() 356 startedAny := f.startEvictionsLocked() 357 f.mu.Unlock() 358 if startedAny { 359 log.Debugf("pgalloc.MemoryFile performing evictions due to memcg pressure") 360 } 361 }, "low") 362 if err != nil { 363 return nil, fmt.Errorf("failed to configure memcg pressure level notifications: %v", err) 364 } 365 f.stopNotifyPressure = stop 366 } 367 368 go f.runReclaim() // S/R-SAFE: f.mu 369 370 if !opts.DisableIMAWorkAround { 371 IMAWorkAroundForMemFile(file.Fd()) 372 } 373 return f, nil 374 } 375 376 // IMAWorkAroundForMemFile works around IMA by immediately creating a temporary 377 // PROT_EXEC mapping, while the backing file is still small. IMA will ignore 378 // any future mappings. 379 // 380 // The Linux kernel contains an optional feature called "Integrity 381 // Measurement Architecture" (IMA). If IMA is enabled, it will checksum 382 // binaries the first time they are mapped PROT_EXEC. This is bad news for 383 // executable pages mapped from our backing file, which can grow to 384 // terabytes in (sparse) size. If IMA attempts to checksum a file that 385 // large, it will allocate all of the sparse pages and quickly exhaust all 386 // memory. 387 func IMAWorkAroundForMemFile(fd uintptr) { 388 m, _, errno := unix.Syscall6( 389 unix.SYS_MMAP, 390 0, 391 hostarch.PageSize, 392 unix.PROT_EXEC, 393 unix.MAP_SHARED, 394 fd, 395 0) 396 if errno != 0 { 397 // This isn't fatal (IMA may not even be in use). Log the error, but 398 // don't return it. 399 log.Warningf("Failed to pre-map MemoryFile PROT_EXEC: %v", errno) 400 } else { 401 if _, _, errno := unix.Syscall( 402 unix.SYS_MUNMAP, 403 m, 404 hostarch.PageSize, 405 0); errno != 0 { 406 panic(fmt.Sprintf("failed to unmap PROT_EXEC MemoryFile mapping: %v", errno)) 407 } 408 } 409 } 410 411 // Destroy releases all resources used by f. 412 // 413 // Preconditions: All pages allocated by f have been freed. 414 // 415 // Postconditions: None of f's methods may be called after Destroy. 416 func (f *MemoryFile) Destroy() { 417 f.mu.Lock() 418 defer f.mu.Unlock() 419 f.destroyed = true 420 f.reclaimCond.Signal() 421 } 422 423 // AllocationMode provides a way to inform the pgalloc API how to allocate 424 // memory and pages on the host. 425 // A page will exist in one of the following incremental states: 426 // 1. Allocated: A page is allocated if it was returned by Allocate() and its 427 // reference count hasn't dropped to 0 since then. 428 // 2. 
Committed: As described in MemoryFile documentation above, a page is 429 // committed if the host kernel is spending resources to store its 430 // contents. A committed page is implicitly allocated. 431 // 3. Populated: A page is populated for reading/writing in a page table 432 // hierarchy if it has a page table entry that permits reading/writing 433 // respectively. A populated page is implicitly committed, since the page 434 // table entry needs a physical page to point to, but not vice versa. 435 type AllocationMode int 436 437 const ( 438 // AllocateOnly indicates that pages need to only be allocated. 439 AllocateOnly AllocationMode = iota 440 // AllocateAndCommit indicates that pages need to be committed, in addition 441 // to being allocated. 442 AllocateAndCommit 443 // AllocateAndWritePopulate indicates that writable pages should ideally be 444 // populated in the page table, in addition to being allocated. This is a 445 // suggestion, not a requirement. 446 AllocateAndWritePopulate 447 ) 448 449 // AllocOpts are options used in MemoryFile.Allocate. 450 type AllocOpts struct { 451 // Kind is the memory kind to be used for accounting. 452 Kind usage.MemoryKind 453 // Dir indicates the direction in which offsets are allocated. 454 Dir Direction 455 // MemCgID is the memory cgroup ID and the zero value indicates that 456 // the memory will not be accounted to any cgroup. 457 MemCgID uint32 458 // Mode allows the callers to select how the pages are allocated in the 459 // MemoryFile. Callers that will fill the allocated memory by writing to it 460 // should pass AllocateAndWritePopulate to avoid faulting page-by-page. Callers 461 // that will fill the allocated memory by invoking host system calls should 462 // pass AllocateOnly. 463 Mode AllocationMode 464 // If Reader is provided, the allocated memory is filled by calling 465 // ReadToBlocks() repeatedly until either length bytes are read or a non-nil 466 // error is returned. It returns the allocated memory, truncated down to the 467 // nearest page. If this is shorter than length bytes due to an error 468 // returned by ReadToBlocks(), it returns the partially filled fr and error. 469 Reader safemem.Reader 470 } 471 472 // Allocate returns a range of initially-zeroed pages of the given length with 473 // the given accounting kind and a single reference held by the caller. When 474 // the last reference on an allocated page is released, ownership of the page 475 // is returned to the MemoryFile, allowing it to be returned by a future call 476 // to Allocate. 477 // 478 // Preconditions: length must be page-aligned and non-zero. 479 func (f *MemoryFile) Allocate(length uint64, opts AllocOpts) (memmap.FileRange, error) { 480 fr, err := f.allocate(length, &opts) 481 if err != nil { 482 return memmap.FileRange{}, err 483 } 484 var dsts safemem.BlockSeq 485 switch opts.Mode { 486 case AllocateOnly: // Allocation is handled above. Nothing more to do. 
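// For AllocateAndCommit, commitFile (below) issues fallocate(2) with mode 0, which makes the host allocate backing for the range up front; per the MemoryFile model above, the pages become committed without being populated in any page tables.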
487 case AllocateAndCommit: 488 if err := f.commitFile(fr); err != nil { 489 f.DecRef(fr) 490 return memmap.FileRange{}, err 491 } 492 case AllocateAndWritePopulate: 493 dsts, err = f.MapInternal(fr, hostarch.Write) 494 if err != nil { 495 f.DecRef(fr) 496 return memmap.FileRange{}, err 497 } 498 if canPopulate() { 499 rem := dsts 500 for { 501 if !tryPopulate(rem.Head()) { 502 break 503 } 504 rem = rem.Tail() 505 if rem.IsEmpty() { 506 break 507 } 508 } 509 } 510 default: 511 panic(fmt.Sprintf("unknown allocation mode: %d", opts.Mode)) 512 } 513 if opts.Reader != nil { 514 if dsts.IsEmpty() { 515 dsts, err = f.MapInternal(fr, hostarch.Write) 516 if err != nil { 517 f.DecRef(fr) 518 return memmap.FileRange{}, err 519 } 520 } 521 n, err := safemem.ReadFullToBlocks(opts.Reader, dsts) 522 un := uint64(hostarch.Addr(n).RoundDown()) 523 if un < length { 524 // Free unused memory and update fr to contain only the memory that is 525 // still allocated. 526 f.DecRef(memmap.FileRange{fr.Start + un, fr.End}) 527 fr.End = fr.Start + un 528 } 529 if err != nil { 530 return fr, err 531 } 532 } 533 return fr, nil 534 } 535 536 func (f *MemoryFile) allocate(length uint64, opts *AllocOpts) (memmap.FileRange, error) { 537 if length == 0 || length%hostarch.PageSize != 0 { 538 panic(fmt.Sprintf("invalid allocation length: %#x", length)) 539 } 540 541 f.mu.Lock() 542 defer f.mu.Unlock() 543 544 // Align hugepage-and-larger allocations on hugepage boundaries to try 545 // to take advantage of hugetmpfs. 546 alignment := uint64(hostarch.PageSize) 547 if length >= hostarch.HugePageSize { 548 alignment = hostarch.HugePageSize 549 } 550 551 // Find a range in the underlying file. 552 fr, ok := f.findAvailableRange(length, alignment, opts.Dir) 553 if !ok { 554 return memmap.FileRange{}, linuxerr.ENOMEM 555 } 556 557 // Expand the file if needed. 558 if int64(fr.End) > f.fileSize { 559 // Round the new file size up to be chunk-aligned. 560 newFileSize := (int64(fr.End) + chunkMask) &^ chunkMask 561 if err := f.file.Truncate(newFileSize); err != nil { 562 return memmap.FileRange{}, err 563 } 564 f.fileSize = newFileSize 565 f.mappingsMu.Lock() 566 oldMappings := f.mappings.Load().([]uintptr) 567 newMappings := make([]uintptr, newFileSize>>chunkShift) 568 copy(newMappings, oldMappings) 569 f.mappings.Store(newMappings) 570 f.mappingsMu.Unlock() 571 } 572 573 if f.opts.ManualZeroing { 574 if err := f.manuallyZero(fr); err != nil { 575 return memmap.FileRange{}, err 576 } 577 } 578 // Mark selected pages as in use. 579 if !f.usage.Add(fr, usageInfo{ 580 kind: opts.Kind, 581 refs: 1, 582 memCgID: opts.MemCgID, 583 }) { 584 panic(fmt.Sprintf("allocating %v: failed to insert into usage set:\n%v", fr, &f.usage)) 585 } 586 587 return fr, nil 588 } 589 590 // findAvailableRange returns an available range in the usageSet. 591 // 592 // Note that scanning for available slots starts at the end of the file and 593 // works backwards, then forwards. This heuristic has important consequences for how sequential 594 // mappings can be merged in the host VMAs, given that addresses for both 595 // application and sentry mappings are allocated top-down (from higher to 596 // lower addresses). The file is also grown exponentially in order to create 597 // space for mappings to be allocated downwards. 598 // 599 // Precondition: alignment must be a power of 2.
600 func (f *MemoryFile) findAvailableRange(length, alignment uint64, dir Direction) (memmap.FileRange, bool) { 601 if dir == BottomUp { 602 return findAvailableRangeBottomUp(&f.usage, length, alignment) 603 } 604 return findAvailableRangeTopDown(&f.usage, f.fileSize, length, alignment) 605 } 606 607 func findAvailableRangeTopDown(usage *usageSet, fileSize int64, length, alignment uint64) (memmap.FileRange, bool) { 608 alignmentMask := alignment - 1 609 610 // Search for space in existing gaps, starting at the current end of the 611 // file and working backward. 612 lastGap := usage.LastGap() 613 gap := lastGap 614 for { 615 end := gap.End() 616 if end > uint64(fileSize) { 617 end = uint64(fileSize) 618 } 619 620 // Try to allocate from the end of this gap, with the start of the 621 // allocated range aligned down to alignment. 622 unalignedStart := end - length 623 if unalignedStart > end { 624 // Negative overflow: this and all preceding gaps are too small to 625 // accommodate length. 626 break 627 } 628 if start := unalignedStart &^ alignmentMask; start >= gap.Start() { 629 return memmap.FileRange{start, start + length}, true 630 } 631 632 gap = gap.PrevLargeEnoughGap(length) 633 if !gap.Ok() { 634 break 635 } 636 } 637 638 // Check that it's possible to fit this allocation at the end of a file of any size. 639 min := lastGap.Start() 640 min = (min + alignmentMask) &^ alignmentMask 641 if min+length < min { 642 // Overflow: allocation would exceed the range of uint64. 643 return memmap.FileRange{}, false 644 } 645 646 // Determine the minimum file size required to fit this allocation at its end. 647 for { 648 newFileSize := 2 * fileSize 649 if newFileSize <= fileSize { 650 if fileSize != 0 { 651 // Overflow: allocation would exceed the range of int64. 652 return memmap.FileRange{}, false 653 } 654 newFileSize = chunkSize 655 } 656 fileSize = newFileSize 657 658 unalignedStart := uint64(fileSize) - length 659 if unalignedStart > uint64(fileSize) { 660 // Negative overflow: fileSize is still inadequate. 661 continue 662 } 663 if start := unalignedStart &^ alignmentMask; start >= min { 664 return memmap.FileRange{start, start + length}, true 665 } 666 } 667 } 668 669 func findAvailableRangeBottomUp(usage *usageSet, length, alignment uint64) (memmap.FileRange, bool) { 670 alignmentMask := alignment - 1 671 for gap := usage.FirstGap(); gap.Ok(); gap = gap.NextLargeEnoughGap(length) { 672 // Align the start address and check if allocation still fits in the gap. 673 start := (gap.Start() + alignmentMask) &^ alignmentMask 674 675 // File offsets are int64s. Since length must be strictly positive, end 676 // cannot legitimately be 0. 677 end := start + length 678 if end < start || int64(end) <= 0 { 679 return memmap.FileRange{}, false 680 } 681 if end <= gap.End() { 682 return memmap.FileRange{start, end}, true 683 } 684 } 685 686 // NextLargeEnoughGap should have returned a gap at the end. 
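// The trailing gap in the usage set extends to usageSetFunctions.MaxKey() (math.MaxUint64), so the loop above always terminates from within that gap, either by returning a fit or by reporting overflow; falling out of the loop therefore indicates an inconsistent usage set.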
687 panic(fmt.Sprintf("NextLargeEnoughGap didn't return a gap at the end, length: %d", length)) 688 } 689 690 var mlockDisabled atomicbitops.Uint32 691 var madvPopulateWriteDisabled atomicbitops.Uint32 692 693 func canPopulate() bool { 694 return mlockDisabled.Load() == 0 || madvPopulateWriteDisabled.Load() == 0 695 } 696 697 func tryPopulateMadv(b safemem.Block) bool { 698 if madvPopulateWriteDisabled.Load() != 0 { 699 return false 700 } 701 start, ok := hostarch.Addr(b.Addr()).RoundUp() 702 if !ok { 703 return true 704 } 705 end := hostarch.Addr(b.Addr() + uintptr(b.Len())).RoundDown() 706 bLen := end - start 707 // Only call madvise(MADV_POPULATE_WRITE) if >=2 pages are being populated. 708 // 1 syscall overhead >= 1 page fault overhead. This is because syscalls are 709 // susceptible to additional overheads like seccomp-bpf filters and auditing. 710 if start >= end || bLen <= hostarch.PageSize { 711 return true 712 } 713 _, _, errno := unix.RawSyscall(unix.SYS_MADVISE, uintptr(start), uintptr(bLen), unix.MADV_POPULATE_WRITE) 714 if errno != 0 { 715 if errno == unix.EINVAL { 716 // EINVAL is expected if MADV_POPULATE_WRITE is not supported (Linux <5.14). 717 log.Infof("Disabling pgalloc.MemoryFile.AllocateAndFill pre-population: madvise failed: %s", errno) 718 } else { 719 log.Warningf("Disabling pgalloc.MemoryFile.AllocateAndFill pre-population: madvise failed: %s", errno) 720 } 721 madvPopulateWriteDisabled.Store(1) 722 return false 723 } 724 return true 725 } 726 727 func tryPopulateMlock(b safemem.Block) bool { 728 if mlockDisabled.Load() != 0 { 729 return false 730 } 731 // Call mlock to populate pages, then munlock to cancel the mlock (but keep 732 // the pages populated). Only do so for hugepage-aligned address ranges to 733 // ensure that splitting the VMA in mlock doesn't split any existing 734 // hugepages. This assumes that two host syscalls, plus the MM overhead of 735 // mlock + munlock, are faster on average than trapping for 736 // HugePageSize/PageSize small page faults. 737 start, ok := hostarch.Addr(b.Addr()).HugeRoundUp() 738 if !ok { 739 return true 740 } 741 end := hostarch.Addr(b.Addr() + uintptr(b.Len())).HugeRoundDown() 742 if start >= end { 743 return true 744 } 745 _, _, errno := unix.Syscall(unix.SYS_MLOCK, uintptr(start), uintptr(end-start), 0) 746 unix.RawSyscall(unix.SYS_MUNLOCK, uintptr(start), uintptr(end-start), 0) 747 if errno != 0 { 748 if errno == unix.ENOMEM || errno == unix.EPERM { 749 // These errors are expected from hitting non-zero RLIMIT_MEMLOCK, or 750 // hitting zero RLIMIT_MEMLOCK without CAP_IPC_LOCK, respectively. 751 log.Infof("Disabling pgalloc.MemoryFile.AllocateAndFill pre-population: mlock failed: %s", errno) 752 } else { 753 log.Warningf("Disabling pgalloc.MemoryFile.AllocateAndFill pre-population: mlock failed: %s", errno) 754 } 755 mlockDisabled.Store(1) 756 return false 757 } 758 return true 759 } 760 761 func tryPopulate(b safemem.Block) bool { 762 // There are two approaches for populating writable pages: 763 // 1. madvise(MADV_POPULATE_WRITE). It has the desired effect: "Populate 764 // (prefault) page tables writable, faulting in all pages in the range 765 // just as if manually writing to each page". 766 // 2. Call mlock to populate pages, then munlock to cancel the mlock (but 767 // keep the pages populated). 768 // 769 // Prefer the madvise(MADV_POPULATE_WRITE) approach because: 770 // - Only requires 1 syscall, as opposed to 2 syscalls with mlock approach.
771 // - It is faster because it doesn't have to modify vmas like mlock does. 772 // - It works for disk-backed memory mappings too. The mlock approach doesn't 773 // work for disk-backed filesystems (e.g. ext4). This is because 774 // mlock(2) => mm/gup.c:__mm_populate() emulates a read fault on writable 775 // MAP_SHARED mappings. For memory-backed (shmem) files, 776 // mm/mmap.c:vma_set_page_prot() => vma_wants_writenotify() is false, so 777 // the page table entries populated by a read fault are writable. For 778 // disk-backed files, vma_set_page_prot() => vma_wants_writenotify() is 779 // true, so the page table entries populated by a read fault are read-only. 780 if tryPopulateMadv(b) { 781 return true 782 } 783 return tryPopulateMlock(b) 784 } 785 786 // fallocate(2) modes, defined in Linux's include/uapi/linux/falloc.h. 787 const ( 788 _FALLOC_FL_KEEP_SIZE = 1 789 _FALLOC_FL_PUNCH_HOLE = 2 790 ) 791 792 // Decommit releases resources associated with maintaining the contents of the 793 // given pages. If Decommit succeeds, future accesses of the decommitted pages 794 // will read zeroes. 795 // 796 // Preconditions: fr.Length() > 0. 797 func (f *MemoryFile) Decommit(fr memmap.FileRange) error { 798 if !fr.WellFormed() || fr.Length() == 0 || fr.Start%hostarch.PageSize != 0 || fr.End%hostarch.PageSize != 0 { 799 panic(fmt.Sprintf("invalid range: %v", fr)) 800 } 801 802 if f.opts.ManualZeroing { 803 // FALLOC_FL_PUNCH_HOLE may not zero pages if ManualZeroing is in 804 // effect. 805 if err := f.manuallyZero(fr); err != nil { 806 return err 807 } 808 } else { 809 if err := f.decommitFile(fr); err != nil { 810 return err 811 } 812 } 813 814 f.markDecommitted(fr) 815 return nil 816 } 817 818 func (f *MemoryFile) manuallyZero(fr memmap.FileRange) error { 819 return f.forEachMappingSlice(fr, func(bs []byte) { 820 for i := range bs { 821 bs[i] = 0 822 } 823 }) 824 } 825 826 func (f *MemoryFile) commitFile(fr memmap.FileRange) error { 827 // "The default operation (i.e., mode is zero) of fallocate() allocates the 828 // disk space within the range specified by offset and len." - fallocate(2) 829 return unix.Fallocate( 830 int(f.file.Fd()), 831 0, // mode 832 int64(fr.Start), 833 int64(fr.Length())) 834 } 835 836 func (f *MemoryFile) decommitFile(fr memmap.FileRange) error { 837 // "After a successful call, subsequent reads from this range will 838 // return zeroes. The FALLOC_FL_PUNCH_HOLE flag must be ORed with 839 // FALLOC_FL_KEEP_SIZE in mode ..." - fallocate(2) 840 return unix.Fallocate( 841 int(f.file.Fd()), 842 _FALLOC_FL_PUNCH_HOLE|_FALLOC_FL_KEEP_SIZE, 843 int64(fr.Start), 844 int64(fr.Length())) 845 } 846 847 func (f *MemoryFile) markDecommitted(fr memmap.FileRange) { 848 f.mu.Lock() 849 defer f.mu.Unlock() 850 // Since we're changing the knownCommitted attribute, we need to merge 851 // across the entire range to ensure that the usage tree is minimal. 852 gap := f.usage.ApplyContiguous(fr, func(seg usageIterator) { 853 val := seg.ValuePtr() 854 if val.knownCommitted { 855 // Drop the usageExpected appropriately. 856 amount := seg.Range().Length() 857 usage.MemoryAccounting.Dec(amount, val.kind, val.memCgID) 858 f.usageExpected -= amount 859 val.knownCommitted = false 860 } 861 val.memCgID = 0 862 }) 863 if gap.Ok() { 864 panic(fmt.Sprintf("Decommit(%v): attempted to decommit unallocated pages %v:\n%v", fr, gap.Range(), &f.usage)) 865 } 866 f.usage.MergeRange(fr) 867 } 868 869 // IncRef implements memmap.File.IncRef. 
870 func (f *MemoryFile) IncRef(fr memmap.FileRange, memCgID uint32) { 871 if !fr.WellFormed() || fr.Length() == 0 || fr.Start%hostarch.PageSize != 0 || fr.End%hostarch.PageSize != 0 { 872 panic(fmt.Sprintf("invalid range: %v", fr)) 873 } 874 875 f.mu.Lock() 876 defer f.mu.Unlock() 877 878 gap := f.usage.ApplyContiguous(fr, func(seg usageIterator) { 879 seg.ValuePtr().refs++ 880 }) 881 if gap.Ok() { 882 panic(fmt.Sprintf("IncRef(%v): attempted to IncRef on unallocated pages %v:\n%v", fr, gap.Range(), &f.usage)) 883 } 884 885 f.usage.MergeAdjacent(fr) 886 } 887 888 // DecRef implements memmap.File.DecRef. 889 func (f *MemoryFile) DecRef(fr memmap.FileRange) { 890 if !fr.WellFormed() || fr.Length() == 0 || fr.Start%hostarch.PageSize != 0 || fr.End%hostarch.PageSize != 0 { 891 panic(fmt.Sprintf("invalid range: %v", fr)) 892 } 893 894 var freed bool 895 896 f.mu.Lock() 897 defer f.mu.Unlock() 898 899 for seg := f.usage.FindSegment(fr.Start); seg.Ok() && seg.Start() < fr.End; seg = seg.NextSegment() { 900 seg = f.usage.Isolate(seg, fr) 901 val := seg.ValuePtr() 902 if val.refs == 0 { 903 panic(fmt.Sprintf("DecRef(%v): 0 existing references on %v:\n%v", fr, seg.Range(), &f.usage)) 904 } 905 val.refs-- 906 if val.refs == 0 { 907 f.reclaim.Add(seg.Range(), reclaimSetValue{}) 908 freed = true 909 // Reclassify memory as System, until it's freed by the reclaim 910 // goroutine. 911 if val.knownCommitted { 912 usage.MemoryAccounting.Move(seg.Range().Length(), usage.System, val.kind, val.memCgID) 913 } 914 val.kind = usage.System 915 } 916 } 917 f.usage.MergeAdjacent(fr) 918 919 if freed { 920 f.reclaimable = true 921 f.reclaimCond.Signal() 922 } 923 } 924 925 // MapInternal implements memmap.File.MapInternal. 926 func (f *MemoryFile) MapInternal(fr memmap.FileRange, at hostarch.AccessType) (safemem.BlockSeq, error) { 927 if !fr.WellFormed() || fr.Length() == 0 { 928 panic(fmt.Sprintf("invalid range: %v", fr)) 929 } 930 if at.Execute { 931 return safemem.BlockSeq{}, linuxerr.EACCES 932 } 933 934 chunks := ((fr.End + chunkMask) >> chunkShift) - (fr.Start >> chunkShift) 935 if chunks == 1 { 936 // Avoid an unnecessary slice allocation. 937 var seq safemem.BlockSeq 938 err := f.forEachMappingSlice(fr, func(bs []byte) { 939 seq = safemem.BlockSeqOf(safemem.BlockFromSafeSlice(bs)) 940 }) 941 return seq, err 942 } 943 blocks := make([]safemem.Block, 0, chunks) 944 err := f.forEachMappingSlice(fr, func(bs []byte) { 945 blocks = append(blocks, safemem.BlockFromSafeSlice(bs)) 946 }) 947 return safemem.BlockSeqFromSlice(blocks), err 948 } 949 950 // forEachMappingSlice invokes fn on a sequence of byte slices that 951 // collectively map all bytes in fr. 
952 func (f *MemoryFile) forEachMappingSlice(fr memmap.FileRange, fn func([]byte)) error { 953 mappings := f.mappings.Load().([]uintptr) 954 for chunkStart := fr.Start &^ chunkMask; chunkStart < fr.End; chunkStart += chunkSize { 955 chunk := int(chunkStart >> chunkShift) 956 m := atomic.LoadUintptr(&mappings[chunk]) 957 if m == 0 { 958 var err error 959 mappings, m, err = f.getChunkMapping(chunk) 960 if err != nil { 961 return err 962 } 963 } 964 startOff := uint64(0) 965 if chunkStart < fr.Start { 966 startOff = fr.Start - chunkStart 967 } 968 endOff := uint64(chunkSize) 969 if chunkStart+chunkSize > fr.End { 970 endOff = fr.End - chunkStart 971 } 972 fn(unsafeSlice(m, chunkSize)[startOff:endOff]) 973 } 974 return nil 975 } 976 977 func (f *MemoryFile) getChunkMapping(chunk int) ([]uintptr, uintptr, error) { 978 f.mappingsMu.Lock() 979 defer f.mappingsMu.Unlock() 980 // Another thread may have replaced f.mappings altogether due to file 981 // expansion. 982 mappings := f.mappings.Load().([]uintptr) 983 // Another thread may have already mapped the chunk. 984 if m := mappings[chunk]; m != 0 { 985 return mappings, m, nil 986 } 987 m, _, errno := unix.Syscall6( 988 unix.SYS_MMAP, 989 0, 990 chunkSize, 991 unix.PROT_READ|unix.PROT_WRITE, 992 unix.MAP_SHARED, 993 f.file.Fd(), 994 uintptr(chunk<<chunkShift)) 995 if errno != 0 { 996 return nil, 0, errno 997 } 998 atomic.StoreUintptr(&mappings[chunk], m) 999 return mappings, m, nil 1000 } 1001 1002 // MarkEvictable allows f to request memory deallocation by calling 1003 // user.Evict(er) in the future. 1004 // 1005 // Redundantly marking an already-evictable range as evictable has no effect. 1006 func (f *MemoryFile) MarkEvictable(user EvictableMemoryUser, er EvictableRange) { 1007 f.mu.Lock() 1008 defer f.mu.Unlock() 1009 info, ok := f.evictable[user] 1010 if !ok { 1011 info = &evictableMemoryUserInfo{} 1012 f.evictable[user] = info 1013 } 1014 gap := info.ranges.LowerBoundGap(er.Start) 1015 for gap.Ok() && gap.Start() < er.End { 1016 gapER := gap.Range().Intersect(er) 1017 if gapER.Length() == 0 { 1018 gap = gap.NextGap() 1019 continue 1020 } 1021 gap = info.ranges.Insert(gap, gapER, evictableRangeSetValue{}).NextGap() 1022 } 1023 if !info.evicting { 1024 switch f.opts.DelayedEviction { 1025 case DelayedEvictionDisabled: 1026 // Kick off eviction immediately. 1027 f.startEvictionGoroutineLocked(user, info) 1028 case DelayedEvictionEnabled: 1029 if !f.opts.UseHostMemcgPressure { 1030 // Ensure that the reclaimer goroutine is running, so that it 1031 // can start eviction when necessary. 1032 f.reclaimCond.Signal() 1033 } 1034 } 1035 } 1036 } 1037 1038 // MarkUnevictable informs f that user no longer considers er to be evictable, 1039 // so the MemoryFile should no longer call user.Evict(er). Note that, per 1040 // EvictableMemoryUser.Evict's documentation, user.Evict(er) may still be 1041 // called even after MarkUnevictable returns due to race conditions, and 1042 // implementations of EvictableMemoryUser must handle this possibility. 1043 // 1044 // Redundantly marking an already-unevictable range as unevictable has no 1045 // effect. 
1046 func (f *MemoryFile) MarkUnevictable(user EvictableMemoryUser, er EvictableRange) { 1047 f.mu.Lock() 1048 defer f.mu.Unlock() 1049 info, ok := f.evictable[user] 1050 if !ok { 1051 return 1052 } 1053 seg := info.ranges.LowerBoundSegment(er.Start) 1054 for seg.Ok() && seg.Start() < er.End { 1055 seg = info.ranges.Isolate(seg, er) 1056 seg = info.ranges.Remove(seg).NextSegment() 1057 } 1058 // We can only remove info if there's no eviction goroutine running on its 1059 // behalf. 1060 if !info.evicting && info.ranges.IsEmpty() { 1061 delete(f.evictable, user) 1062 } 1063 } 1064 1065 // MarkAllUnevictable informs f that user no longer considers any offsets to be 1066 // evictable. It otherwise has the same semantics as MarkUnevictable. 1067 func (f *MemoryFile) MarkAllUnevictable(user EvictableMemoryUser) { 1068 f.mu.Lock() 1069 defer f.mu.Unlock() 1070 info, ok := f.evictable[user] 1071 if !ok { 1072 return 1073 } 1074 info.ranges.RemoveAll() 1075 // We can only remove info if there's no eviction goroutine running on its 1076 // behalf. 1077 if !info.evicting { 1078 delete(f.evictable, user) 1079 } 1080 } 1081 1082 // ShouldCacheEvictable returns true if f is meaningfully delaying evictions of 1083 // evictable memory, such that it may be advantageous to cache data in 1084 // evictable memory. The value returned by ShouldCacheEvictable may change 1085 // between calls. 1086 func (f *MemoryFile) ShouldCacheEvictable() bool { 1087 return f.opts.DelayedEviction == DelayedEvictionManual || f.opts.UseHostMemcgPressure 1088 } 1089 1090 // UpdateUsage ensures that the memory usage statistics in 1091 // usage.MemoryAccounting are up to date. If memCgID is non-zero, the scan is 1092 // restricted to pages accounted to that memory cgroup; a zero memCgID scans 1093 // the entire file. 1094 func (f *MemoryFile) UpdateUsage(memCgID uint32) error { 1095 f.mu.Lock() 1096 defer f.mu.Unlock() 1097 1098 // If the underlying usage matches what the usage tree already 1099 // represents, then we can just avoid the entire scan (we know it's 1100 // accurate). 1101 currentUsage, err := f.TotalUsage() 1102 if err != nil { 1103 return err 1104 } 1105 if currentUsage == f.usageExpected && f.usageSwapped == 0 { 1106 log.Debugf("UpdateUsage: skipped with usageSwapped=0.") 1107 return nil 1108 } 1109 // If the current usage matches the expected but there's swap 1110 // accounting, then ensure a scan takes place at least every second 1111 // (when requested). 1112 if currentUsage == f.usageExpected+f.usageSwapped && time.Now().Before(f.usageLast.Add(time.Second)) { 1113 log.Debugf("UpdateUsage: skipped with usageSwapped!=0.") 1114 return nil 1115 } 1116 1117 // Linux updates usage values at CONFIG_HZ.
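// Skip the scan if the previous one started less than time.Second/linux.CLOCKS_PER_SEC ago (10ms with the Linux ABI value of 100), mirroring the granularity at which Linux itself refreshes usage statistics.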
1118 if scanningAfter := time.Now().Sub(f.usageLast).Milliseconds(); scanningAfter < time.Second.Milliseconds()/linux.CLOCKS_PER_SEC { 1119 log.Debugf("UpdateUsage: skipped because previous scan happened %d ms back", scanningAfter) 1120 return nil 1121 } 1122 1123 if memCgID == 0 { 1124 f.usageLast = time.Now() 1125 } 1126 err = f.updateUsageLocked(currentUsage, memCgID, mincore) 1127 log.Debugf("UpdateUsage: currentUsage=%d, usageExpected=%d, usageSwapped=%d.", 1128 currentUsage, f.usageExpected, f.usageSwapped) 1129 log.Debugf("UpdateUsage: took %v.", time.Since(f.usageLast)) 1130 return err 1131 } 1132 1133 // updateUsageLocked attempts to detect commitment of previous-uncommitted 1134 // pages by invoking checkCommitted, which is a function that, for each page i 1135 // in bs, sets committed[i] to 1 if the page is committed and 0 otherwise. 1136 // 1137 // Precondition: f.mu must be held; it may be unlocked and reacquired. 1138 // +checklocks:f.mu 1139 func (f *MemoryFile) updateUsageLocked(currentUsage uint64, memCgID uint32, checkCommitted func(bs []byte, committed []byte) error) error { 1140 // Track if anything changed to elide the merge. In the common case, we 1141 // expect all segments to be committed and no merge to occur. 1142 changedAny := false 1143 defer func() { 1144 if changedAny { 1145 f.usage.MergeAll() 1146 } 1147 1148 // Adjust the swap usage to reflect reality. 1149 if f.usageExpected < currentUsage { 1150 // Since no pages may be marked decommitted while we hold mu, we 1151 // know that usage may have only increased since we got the last 1152 // current usage. Therefore, if usageExpected is still short of 1153 // currentUsage, we must assume that the difference is in pages 1154 // that have been swapped. 1155 newUsageSwapped := currentUsage - f.usageExpected 1156 if f.usageSwapped < newUsageSwapped { 1157 usage.MemoryAccounting.Inc(newUsageSwapped-f.usageSwapped, usage.System, 0) 1158 } else { 1159 usage.MemoryAccounting.Dec(f.usageSwapped-newUsageSwapped, usage.System, 0) 1160 } 1161 f.usageSwapped = newUsageSwapped 1162 } else if f.usageSwapped != 0 { 1163 // We have more usage accounted for than the file itself. 1164 // That's fine, we probably caught a race where pages were 1165 // being committed while the below loop was running. Just 1166 // report the higher number that we found and ignore swap. 1167 usage.MemoryAccounting.Dec(f.usageSwapped, usage.System, 0) 1168 f.usageSwapped = 0 1169 } 1170 }() 1171 1172 // Reused mincore buffer, will generally be <= 4096 bytes. 1173 var buf []byte 1174 1175 // Iterate over all usage data. There will only be usage segments 1176 // present when there is an associated reference. 1177 for seg := f.usage.FirstSegment(); seg.Ok(); { 1178 if !seg.ValuePtr().canCommit() { 1179 seg = seg.NextSegment() 1180 continue 1181 } 1182 1183 // Scan the pages of the given memCgID only. This will avoid scanning the 1184 // whole memory file when the memory usage is required only for a specific 1185 // cgroup. The total memory usage of all cgroups can be obtained when the 1186 // memCgID is passed as zero. 1187 if memCgID != 0 && seg.ValuePtr().memCgID != memCgID { 1188 seg = seg.NextSegment() 1189 continue 1190 } 1191 1192 // Get the range for this segment. As we touch slices, the 1193 // Start value will be walked along. 
1194 r := seg.Range() 1195 1196 var checkErr error 1197 err := f.forEachMappingSlice(r, 1198 func(s []byte) { 1199 if checkErr != nil { 1200 return 1201 } 1202 1203 // Ensure that we have sufficient buffer for the call 1204 // (one byte per page). The length of each slice must 1205 // be page-aligned. 1206 bufLen := len(s) / hostarch.PageSize 1207 if len(buf) < bufLen { 1208 buf = make([]byte, bufLen) 1209 } 1210 1211 // Query for new pages in core. 1212 // NOTE(b/165896008): mincore (which is passed as checkCommitted) 1213 // by f.UpdateUsage() might take a really long time. So unlock f.mu 1214 // while checkCommitted runs. 1215 f.mu.Unlock() // +checklocksforce 1216 err := checkCommitted(s, buf) 1217 f.mu.Lock() 1218 if err != nil { 1219 checkErr = err 1220 return 1221 } 1222 1223 // Scan each page and switch out segments. 1224 seg := f.usage.LowerBoundSegment(r.Start) 1225 for i := 0; i < bufLen; { 1226 if buf[i]&0x1 == 0 { 1227 i++ 1228 continue 1229 } 1230 // Scan to the end of this committed range. 1231 j := i + 1 1232 for ; j < bufLen; j++ { 1233 if buf[j]&0x1 == 0 { 1234 break 1235 } 1236 } 1237 committedFR := memmap.FileRange{ 1238 Start: r.Start + uint64(i*hostarch.PageSize), 1239 End: r.Start + uint64(j*hostarch.PageSize), 1240 } 1241 // Advance seg to committedFR.Start. 1242 for seg.Ok() && seg.End() < committedFR.Start { 1243 seg = seg.NextSegment() 1244 } 1245 // Mark pages overlapping committedFR as committed. 1246 for seg.Ok() && seg.Start() < committedFR.End { 1247 if seg.ValuePtr().canCommit() { 1248 seg = f.usage.Isolate(seg, committedFR) 1249 seg.ValuePtr().knownCommitted = true 1250 amount := seg.Range().Length() 1251 usage.MemoryAccounting.Inc(amount, seg.ValuePtr().kind, seg.ValuePtr().memCgID) 1252 f.usageExpected += amount 1253 changedAny = true 1254 } 1255 seg = seg.NextSegment() 1256 } 1257 // Continue scanning for committed pages. 1258 i = j + 1 1259 } 1260 1261 // Advance r.Start. 1262 r.Start += uint64(len(s)) 1263 }) 1264 if checkErr != nil { 1265 return checkErr 1266 } 1267 if err != nil { 1268 return err 1269 } 1270 1271 // Continue with the first segment after r.End. 1272 seg = f.usage.LowerBoundSegment(r.End) 1273 } 1274 1275 return nil 1276 } 1277 1278 // TotalUsage returns an aggregate usage for all memory statistics except 1279 // Mapped (which is external to MemoryFile). This is generally much cheaper 1280 // than UpdateUsage, but will not provide a fine-grained breakdown. 1281 func (f *MemoryFile) TotalUsage() (uint64, error) { 1282 // Stat the underlying file to discover the underlying usage. stat(2) 1283 // always reports the allocated block count in units of 512 bytes. This 1284 // includes pages in the page cache and swapped pages. 1285 var stat unix.Stat_t 1286 if err := unix.Fstat(int(f.file.Fd()), &stat); err != nil { 1287 return 0, err 1288 } 1289 return uint64(stat.Blocks * 512), nil 1290 } 1291 1292 // TotalSize returns the current size of the backing file in bytes, which is an 1293 // upper bound on the amount of memory that can currently be allocated from the 1294 // MemoryFile. The value returned by TotalSize is permitted to change. 1295 func (f *MemoryFile) TotalSize() uint64 { 1296 f.mu.Lock() 1297 defer f.mu.Unlock() 1298 return uint64(f.fileSize) 1299 } 1300 1301 // File returns the backing file. 1302 func (f *MemoryFile) File() *os.File { 1303 return f.file 1304 } 1305 1306 // FD implements memmap.File.FD. 
1307 func (f *MemoryFile) FD() int { 1308 return int(f.file.Fd()) 1309 } 1310 1311 // IsDiskBacked returns true if f is backed by a file on disk. 1312 func (f *MemoryFile) IsDiskBacked() bool { 1313 return f.opts.DiskBackedFile 1314 } 1315 1316 // String implements fmt.Stringer.String. 1317 // 1318 // Note that because f.String locks f.mu, calling f.String internally 1319 // (including indirectly through the fmt package) risks recursive locking. 1320 // Within the pgalloc package, use f.usage directly instead. 1321 func (f *MemoryFile) String() string { 1322 f.mu.Lock() 1323 defer f.mu.Unlock() 1324 return f.usage.String() 1325 } 1326 1327 // runReclaim implements the reclaimer goroutine, which continuously decommits 1328 // reclaimable pages in order to reduce memory usage and make them available 1329 // for allocation. 1330 func (f *MemoryFile) runReclaim() { 1331 for { 1332 // N.B. We must call f.markReclaimed on the returned FrameRange. 1333 fr, ok := f.findReclaimable() 1334 if !ok { 1335 break 1336 } 1337 1338 if f.opts.ManualZeroing { 1339 // If ManualZeroing is in effect, only hugepage-aligned regions may 1340 // be safely passed to decommitFile. Pages will be zeroed on 1341 // reallocation, so we don't need to perform any manual zeroing 1342 // here, whether or not decommitFile succeeds. 1343 if startAddr, ok := hostarch.Addr(fr.Start).HugeRoundUp(); ok { 1344 if endAddr := hostarch.Addr(fr.End).HugeRoundDown(); startAddr < endAddr { 1345 decommitFR := memmap.FileRange{uint64(startAddr), uint64(endAddr)} 1346 if err := f.decommitFile(decommitFR); err != nil { 1347 log.Warningf("Reclaim failed to decommit %v: %v", decommitFR, err) 1348 } 1349 } 1350 } 1351 } else { 1352 if err := f.decommitFile(fr); err != nil { 1353 log.Warningf("Reclaim failed to decommit %v: %v", fr, err) 1354 // Zero the pages manually. This won't reduce memory usage, but at 1355 // least ensures that the pages will be zero when reallocated. 1356 if err := f.manuallyZero(fr); err != nil { 1357 panic(fmt.Sprintf("Reclaim failed to decommit or zero %v: %v", fr, err)) 1358 } 1359 } 1360 } 1361 f.markDecommitted(fr) 1362 f.markReclaimed(fr) 1363 } 1364 1365 // We only get here if findReclaimable finds f.destroyed set and returns 1366 // false. 1367 f.mu.Lock() 1368 if !f.destroyed { 1369 f.mu.Unlock() 1370 panic("findReclaimable broke out of reclaim loop, but destroyed is no longer set") 1371 } 1372 if f.opts.DecommitOnDestroy && f.fileSize > 0 { 1373 if err := f.decommitFile(memmap.FileRange{Start: 0, End: uint64(f.fileSize)}); err != nil { 1374 f.mu.Unlock() 1375 panic(fmt.Sprintf("failed to decommit entire memory file during destruction: %v", err)) 1376 } 1377 } 1378 f.file.Close() 1379 // Ensure that any attempts to use f.file.Fd() fail instead of getting a fd 1380 // that has possibly been reassigned. 1381 f.file = nil 1382 f.mappingsMu.Lock() 1383 defer f.mappingsMu.Unlock() 1384 mappings := f.mappings.Load().([]uintptr) 1385 for i, m := range mappings { 1386 if m != 0 { 1387 _, _, errno := unix.Syscall(unix.SYS_MUNMAP, m, chunkSize, 0) 1388 if errno != 0 { 1389 log.Warningf("Failed to unmap mapping %#x for MemoryFile chunk %d: %v", m, i, errno) 1390 } 1391 } 1392 } 1393 // Similarly, invalidate f.mappings. (atomic.Value.Store(nil) panics.) 1394 f.mappings.Store([]uintptr{}) 1395 f.mu.Unlock() 1396 1397 // This must be called without holding f.mu to avoid circular lock 1398 // ordering. 
1399 if f.stopNotifyPressure != nil { 1400 f.stopNotifyPressure() 1401 } 1402 } 1403 1404 // findReclaimable finds memory that has been marked for reclaim. 1405 // 1406 // Note that the returned range will be removed from tracking. It 1407 // must be reclaimed (removed from f.usage) at this point. 1408 func (f *MemoryFile) findReclaimable() (memmap.FileRange, bool) { 1409 f.mu.Lock() 1410 defer f.mu.Unlock() 1411 for { 1412 for { 1413 if f.destroyed { 1414 return memmap.FileRange{}, false 1415 } 1416 if f.reclaimable { 1417 break 1418 } 1419 if f.opts.DelayedEviction == DelayedEvictionEnabled && !f.opts.UseHostMemcgPressure { 1420 // No work to do. Evict any pending evictable allocations to 1421 // get more reclaimable pages before going to sleep. 1422 f.startEvictionsLocked() 1423 } 1424 f.reclaimCond.Wait() 1425 } 1426 // Most allocations are done upwards, with exceptions being stacks and some 1427 // allocators that allocate top-down. Reclaim preserves this order to 1428 // minimize the cost of the search. 1429 if seg := f.reclaim.FirstSegment(); seg.Ok() { 1430 fr := seg.Range() 1431 f.reclaim.Remove(seg) 1432 return fr, true 1433 } 1434 // Nothing is reclaimable. 1435 f.reclaimable = false 1436 } 1437 } 1438 1439 func (f *MemoryFile) markReclaimed(fr memmap.FileRange) { 1440 f.mu.Lock() 1441 defer f.mu.Unlock() 1442 seg := f.usage.FindSegment(fr.Start) 1443 // All of fr should be mapped to a single uncommitted reclaimable 1444 // segment accounted to System. 1445 if !seg.Ok() { 1446 panic(fmt.Sprintf("reclaimed pages %v include unreferenced pages:\n%v", fr, &f.usage)) 1447 } 1448 if !seg.Range().IsSupersetOf(fr) { 1449 panic(fmt.Sprintf("reclaimed pages %v are not entirely contained in segment %v with state %v:\n%v", fr, seg.Range(), seg.Value(), &f.usage)) 1450 } 1451 if got, want := seg.Value(), (usageInfo{ 1452 kind: usage.System, 1453 knownCommitted: false, 1454 refs: 0, 1455 memCgID: 0, 1456 }); got != want { 1457 panic(fmt.Sprintf("reclaimed pages %v in segment %v has incorrect state %v, wanted %v:\n%v", fr, seg.Range(), got, want, &f.usage)) 1458 } 1459 // Deallocate reclaimed pages. Even though all of seg is reclaimable, 1460 // the caller of markReclaimed may not have decommitted it, so we can 1461 // only mark fr as reclaimed. 1462 f.usage.Remove(f.usage.Isolate(seg, fr)) 1463 } 1464 1465 // StartEvictions requests that f evict all evictable allocations. It does not 1466 // wait for eviction to complete; for this, see MemoryFile.WaitForEvictions. 1467 func (f *MemoryFile) StartEvictions() { 1468 f.mu.Lock() 1469 defer f.mu.Unlock() 1470 f.startEvictionsLocked() 1471 } 1472 1473 // Preconditions: f.mu must be locked. 1474 func (f *MemoryFile) startEvictionsLocked() bool { 1475 startedAny := false 1476 for user, info := range f.evictable { 1477 // Don't start multiple goroutines to evict the same user's 1478 // allocations. 1479 if !info.evicting { 1480 f.startEvictionGoroutineLocked(user, info) 1481 startedAny = true 1482 } 1483 } 1484 return startedAny 1485 } 1486 1487 // Preconditions: 1488 // - info == f.evictable[user]. 1489 // - !info.evicting. 1490 // - f.mu must be locked.
1491 func (f *MemoryFile) startEvictionGoroutineLocked(user EvictableMemoryUser, info *evictableMemoryUserInfo) { 1492 info.evicting = true 1493 f.evictionWG.Add(1) 1494 go func() { // S/R-SAFE: f.evictionWG 1495 defer f.evictionWG.Done() 1496 for { 1497 f.mu.Lock() 1498 info, ok := f.evictable[user] 1499 if !ok { 1500 // This shouldn't happen: only this goroutine is permitted 1501 // to delete this entry. 1502 f.mu.Unlock() 1503 panic(fmt.Sprintf("evictableMemoryUserInfo for EvictableMemoryUser %v deleted while eviction goroutine running", user)) 1504 } 1505 if info.ranges.IsEmpty() { 1506 delete(f.evictable, user) 1507 f.mu.Unlock() 1508 return 1509 } 1510 // Evict from the end of info.ranges, under the assumption that 1511 // if ranges in user start being used again (and are 1512 // consequently marked unevictable), such uses are more likely 1513 // to start from the beginning of user. 1514 seg := info.ranges.LastSegment() 1515 er := seg.Range() 1516 info.ranges.Remove(seg) 1517 // user.Evict() must be called without holding f.mu to avoid 1518 // circular lock ordering. 1519 f.mu.Unlock() 1520 user.Evict(context.Background(), er) 1521 } 1522 }() 1523 } 1524 1525 // WaitForEvictions blocks until f is no longer evicting any evictable 1526 // allocations. 1527 func (f *MemoryFile) WaitForEvictions() { 1528 f.evictionWG.Wait() 1529 } 1530 1531 type usageSetFunctions struct{} 1532 1533 func (usageSetFunctions) MinKey() uint64 { 1534 return 0 1535 } 1536 1537 func (usageSetFunctions) MaxKey() uint64 { 1538 return math.MaxUint64 1539 } 1540 1541 func (usageSetFunctions) ClearValue(val *usageInfo) { 1542 } 1543 1544 func (usageSetFunctions) Merge(_ memmap.FileRange, val1 usageInfo, _ memmap.FileRange, val2 usageInfo) (usageInfo, bool) { 1545 return val1, val1 == val2 1546 } 1547 1548 func (usageSetFunctions) Split(_ memmap.FileRange, val usageInfo, _ uint64) (usageInfo, usageInfo) { 1549 return val, val 1550 } 1551 1552 // evictableRangeSetValue is the value type of evictableRangeSet. 1553 type evictableRangeSetValue struct{} 1554 1555 type evictableRangeSetFunctions struct{} 1556 1557 func (evictableRangeSetFunctions) MinKey() uint64 { 1558 return 0 1559 } 1560 1561 func (evictableRangeSetFunctions) MaxKey() uint64 { 1562 return math.MaxUint64 1563 } 1564 1565 func (evictableRangeSetFunctions) ClearValue(val *evictableRangeSetValue) { 1566 } 1567 1568 func (evictableRangeSetFunctions) Merge(_ EvictableRange, _ evictableRangeSetValue, _ EvictableRange, _ evictableRangeSetValue) (evictableRangeSetValue, bool) { 1569 return evictableRangeSetValue{}, true 1570 } 1571 1572 func (evictableRangeSetFunctions) Split(_ EvictableRange, _ evictableRangeSetValue, _ uint64) (evictableRangeSetValue, evictableRangeSetValue) { 1573 return evictableRangeSetValue{}, evictableRangeSetValue{} 1574 } 1575 1576 // reclaimSetValue is the value type of reclaimSet. 
1577 type reclaimSetValue struct{} 1578 1579 type reclaimSetFunctions struct{} 1580 1581 func (reclaimSetFunctions) MinKey() uint64 { 1582 return 0 1583 } 1584 1585 func (reclaimSetFunctions) MaxKey() uint64 { 1586 return math.MaxUint64 1587 } 1588 1589 func (reclaimSetFunctions) ClearValue(val *reclaimSetValue) { 1590 } 1591 1592 func (reclaimSetFunctions) Merge(_ memmap.FileRange, _ reclaimSetValue, _ memmap.FileRange, _ reclaimSetValue) (reclaimSetValue, bool) { 1593 return reclaimSetValue{}, true 1594 } 1595 1596 func (reclaimSetFunctions) Split(_ memmap.FileRange, _ reclaimSetValue, _ uint64) (reclaimSetValue, reclaimSetValue) { 1597 return reclaimSetValue{}, reclaimSetValue{} 1598 }
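// A minimal usage sketch, illustrative only and not part of the upstream file:
// exampleAllocateAndFill is a hypothetical caller showing how the Allocate API
// above is typically driven. It assumes nothing beyond the exported pgalloc
// API and this file's existing imports.
func exampleAllocateAndFill(f *MemoryFile, src safemem.Reader, length uint64) (memmap.FileRange, error) {
	// length must be page-aligned and non-zero (see Allocate's preconditions).
	fr, err := f.Allocate(length, AllocOpts{
		Kind: usage.Anonymous,
		Dir:  BottomUp,
		// The caller fills the memory from the sentry, so request writable
		// pre-population rather than AllocateOnly or AllocateAndCommit.
		Mode:   AllocateAndWritePopulate,
		Reader: src,
	})
	if err != nil {
		return memmap.FileRange{}, err
	}
	// fr is returned with a single reference held by the caller; releasing it
	// later with f.DecRef(fr) hands the pages back to the reclaimer.
	return fr, nil
}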