github.com/ttpreport/gvisor-ligolo@v0.0.0-20240123134145-a858404967ba/pkg/sentry/pgalloc/pgalloc.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package pgalloc contains the page allocator subsystem, which manages memory
// that may be mapped into application address spaces.
//
// Lock order:
//
//	pgalloc.MemoryFile.mu
//	  pgalloc.MemoryFile.mappingsMu
package pgalloc

import (
	"fmt"
	"math"
	"os"
	"sync/atomic"
	"time"

	"github.com/ttpreport/gvisor-ligolo/pkg/abi/linux"
	"github.com/ttpreport/gvisor-ligolo/pkg/atomicbitops"
	"github.com/ttpreport/gvisor-ligolo/pkg/context"
	"github.com/ttpreport/gvisor-ligolo/pkg/errors/linuxerr"
	"github.com/ttpreport/gvisor-ligolo/pkg/hostarch"
	"github.com/ttpreport/gvisor-ligolo/pkg/log"
	"github.com/ttpreport/gvisor-ligolo/pkg/safemem"
	"github.com/ttpreport/gvisor-ligolo/pkg/sentry/hostmm"
	"github.com/ttpreport/gvisor-ligolo/pkg/sentry/memmap"
	"github.com/ttpreport/gvisor-ligolo/pkg/sentry/usage"
	"github.com/ttpreport/gvisor-ligolo/pkg/sync"
	"golang.org/x/sys/unix"
)

// Direction describes how to allocate offsets from MemoryFile.
type Direction int

const (
	// BottomUp allocates offsets in increasing order.
	BottomUp Direction = iota
	// TopDown allocates offsets in decreasing order.
	TopDown
)

// String implements fmt.Stringer.
func (d Direction) String() string {
	switch d {
	case BottomUp:
		return "up"
	case TopDown:
		return "down"
	}
	panic(fmt.Sprintf("invalid direction: %d", d))
}

// MemoryFile is a memmap.File whose pages may be allocated to arbitrary
// users.
type MemoryFile struct {
	// opts holds options passed to NewMemoryFile. opts is immutable.
	opts MemoryFileOpts

	// MemoryFile owns a single backing file, which is modeled as follows:
	//
	// Each page in the file can be committed or uncommitted. A page is
	// committed if the host kernel is spending resources to store its contents
	// and uncommitted otherwise. This definition includes pages that the host
	// kernel has swapped; this is intentional, to ensure that accounting does
	// not change even if host kernel swapping behavior changes, and that
	// memory used by pseudo-swap mechanisms like zswap is still accounted.
	//
	// The initial contents of uncommitted pages are implicitly zero bytes. A
	// read or write to the contents of an uncommitted page causes it to be
	// committed. This is the only event that can cause an uncommitted page to
	// be committed.
	//
	// fallocate(FALLOC_FL_PUNCH_HOLE) (MemoryFile.Decommit) causes committed
	// pages to be uncommitted. This is the only event that can cause a
	// committed page to be uncommitted.
	//
	// Memory accounting is based on identifying the set of committed pages.
	// Since we do not have direct access to the MMU, tracking reads and writes
	// to uncommitted pages to detect commitment would introduce additional
	// page faults, which would be prohibitively expensive. Instead, we query
	// the host kernel to determine which pages are committed.

	// file is the backing file. The file pointer is immutable.
	file *os.File

	mu memoryFileMutex

	// usage maps each page in the file to metadata for that page. Pages for
	// which no segment exists in usage are both unallocated (not in use) and
	// uncommitted.
	//
	// Since usage stores usageInfo objects by value, clients should usually
	// use usageIterator.ValuePtr() instead of usageIterator.Value() to get a
	// pointer to the usageInfo rather than a copy.
	//
	// usage must be kept maximally merged (that is, there should never be two
	// adjacent segments with the same values). At least markReclaimed depends
	// on this property.
	//
	// usage is protected by mu.
	usage usageSet

	// The UpdateUsage function scans all segments with knownCommitted set
	// to false, sees which pages are committed and creates corresponding
	// segments with knownCommitted set to true.
	//
	// In order to avoid unnecessary scans, usageExpected tracks the total
	// file blocks expected. This is used to elide the scan when this
	// matches the underlying file blocks.
	//
	// To track swapped pages, usageSwapped tracks the discrepancy between
	// what is observed in core and what is reported by the file. When
	// usageSwapped is non-zero, a sweep will be performed at least every
	// second. The start of the last sweep is recorded in usageLast.
	//
	// All usage attributes are protected by mu.
	usageExpected uint64
	usageSwapped  uint64
	usageLast     time.Time

	// fileSize is the size of the backing memory file in bytes. fileSize is
	// always a power-of-two multiple of chunkSize.
	//
	// fileSize is protected by mu.
	fileSize int64

	// Pages from the backing file are mapped into the local address space on
	// the granularity of large pieces called chunks. mappings is a []uintptr
	// that stores, for each chunk, the start address of a mapping of that
	// chunk in the current process' address space, or 0 if no such mapping
	// exists. Once a chunk is mapped, it is never remapped or unmapped until
	// the MemoryFile is destroyed.
	//
	// Mutating the mappings slice or its contents requires both holding
	// mappingsMu and using atomic memory operations. (The slice is mutated
	// whenever the file is expanded. Per the above, the only permitted
	// mutation of the slice's contents is the assignment of a mapping to a
	// chunk that was previously unmapped.) Reading the slice or its contents
	// only requires *either* holding mappingsMu or using atomic memory
	// operations. This allows MemoryFile.MapInternal to avoid locking in the
	// common case where chunk mappings already exist.
	mappingsMu mappingsMutex
	mappings   atomic.Value

	// destroyed is set by Destroy to instruct the reclaimer goroutine to
	// release resources and exit. destroyed is protected by mu.
	destroyed bool

	// reclaimable is true if usage may contain reclaimable pages. reclaimable
	// is protected by mu.
	reclaimable bool

	// reclaim is the collection of regions for reclaim. reclaim is protected
	// by mu.
	reclaim reclaimSet

	// reclaimCond is signaled (with mu locked) when reclaimable or destroyed
	// transitions from false to true.
	reclaimCond sync.Cond

	// evictable maps EvictableMemoryUsers to eviction state.
	//
	// evictable is protected by mu.
	evictable map[EvictableMemoryUser]*evictableMemoryUserInfo

	// evictionWG counts the number of goroutines currently performing evictions.
	evictionWG sync.WaitGroup

	// stopNotifyPressure stops memory cgroup pressure level
	// notifications used to drive eviction. stopNotifyPressure is
	// immutable.
	stopNotifyPressure func()
}

// MemoryFileOpts provides options to NewMemoryFile.
type MemoryFileOpts struct {
	// DelayedEviction controls the extent to which the MemoryFile may delay
	// eviction of evictable allocations.
	DelayedEviction DelayedEvictionType

	// If UseHostMemcgPressure is true, use host memory cgroup pressure level
	// notifications to determine when eviction is necessary. This option has
	// no effect unless DelayedEviction is DelayedEvictionEnabled.
	UseHostMemcgPressure bool

	// DecommitOnDestroy indicates whether the entire host file should be
	// decommitted on destruction. This is appropriate for host filesystem based
	// files that need to be explicitly cleaned up to release disk space.
	DecommitOnDestroy bool

	// If ManualZeroing is true, MemoryFile must not assume that new pages
	// obtained from the host are zero-filled, such that MemoryFile must manually
	// zero newly-allocated pages.
	ManualZeroing bool

	// If DisableIMAWorkAround is true, NewMemoryFile will not call
	// IMAWorkAroundForMemFile().
	DisableIMAWorkAround bool

	// DiskBackedFile indicates that the MemoryFile is backed by a file on disk.
	DiskBackedFile bool
}

// DelayedEvictionType is the type of MemoryFileOpts.DelayedEviction.
type DelayedEvictionType int

const (
	// DelayedEvictionDefault has unspecified behavior.
	DelayedEvictionDefault DelayedEvictionType = iota

	// DelayedEvictionDisabled requires that evictable allocations are evicted
	// as soon as possible.
	DelayedEvictionDisabled

	// DelayedEvictionEnabled requests that the MemoryFile delay eviction of
	// evictable allocations until doing so is considered necessary to avoid
	// performance degradation due to host memory pressure, or OOM kills.
	//
	// As of this writing, the behavior of DelayedEvictionEnabled depends on
	// whether or not MemoryFileOpts.UseHostMemcgPressure is enabled:
	//
	//   - If UseHostMemcgPressure is true, evictions are delayed until memory
	//     pressure is indicated.
	//
	//   - Otherwise, evictions are only delayed until the reclaimer goroutine
	//     is out of work (pages to reclaim).
	DelayedEvictionEnabled

	// DelayedEvictionManual requires that evictable allocations are only
	// evicted when MemoryFile.StartEvictions() is called. This is extremely
	// dangerous outside of tests.
	DelayedEvictionManual
)

// usageInfo tracks usage information.
//
// +stateify savable
type usageInfo struct {
	// kind is the usage kind.
	kind usage.MemoryKind

	// knownCommitted is true if the tracked region is definitely committed.
	// (If it is false, the tracked region may or may not be committed.)
	knownCommitted bool

	refs uint64

	// memCgID is the memory cgroup id to which this page is committed.
	memCgID uint32
}

// canCommit returns true if the tracked region can be committed.
func (u *usageInfo) canCommit() bool {
	// refs must be greater than 0 because we assume that reclaimable pages
	// (that aren't already known to be committed) are not committed. This
	// isn't necessarily true, even after the reclaimer does Decommit(),
	// because the kernel may subsequently back the hugepage-sized region
	// containing the decommitted page with a hugepage. However, it's
	// consistent with our treatment of unallocated pages, which have the same
	// property.
	return !u.knownCommitted && u.refs != 0
}

// An EvictableMemoryUser represents a user of MemoryFile-allocated memory that
// may be asked to deallocate that memory in the presence of memory pressure.
type EvictableMemoryUser interface {
	// Evict requests that the EvictableMemoryUser deallocate memory used by
	// er, which was registered as evictable by a previous call to
	// MemoryFile.MarkEvictable.
	//
	// Evict is not required to deallocate memory. In particular, since pgalloc
	// must call Evict without holding locks to avoid circular lock ordering,
	// it is possible that the passed range has already been marked as
	// unevictable by a racing call to MemoryFile.MarkUnevictable.
	// Implementations of EvictableMemoryUser must detect such races and handle
	// them by making Evict have no effect on unevictable ranges.
	//
	// After a call to Evict, the MemoryFile will consider the evicted range
	// unevictable (i.e. it will not call Evict on the same range again) until
	// informed otherwise by a subsequent call to MarkEvictable.
	Evict(ctx context.Context, er EvictableRange)
}

// An EvictableRange represents a range of uint64 offsets in an
// EvictableMemoryUser.
//
// In practice, most EvictableMemoryUsers will probably be implementations of
// memmap.Mappable, and EvictableRange therefore corresponds to
// memmap.MappableRange. However, this package cannot depend on the memmap
// package, since doing so would create a circular dependency.
//
// type EvictableRange <generated using go_generics>

// evictableMemoryUserInfo is the value type of MemoryFile.evictable.
type evictableMemoryUserInfo struct {
	// ranges tracks all evictable ranges for the given user.
	ranges evictableRangeSet

	// If evicting is true, there is a goroutine currently evicting all
	// evictable ranges for this user.
	evicting bool
}

const (
	chunkShift = 30
	chunkSize  = 1 << chunkShift // 1 GB
	chunkMask  = chunkSize - 1

	// maxPage is the highest 64-bit page.
	maxPage = math.MaxUint64 &^ (hostarch.PageSize - 1)
)
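
// Worked example (illustrative, not part of the original source): with
// chunkShift = 30, chunkSize is 1 GiB, so a hypothetical file offset such as
// 0x6003F000 maps to chunk 1 at offset 0x2003F000 within that chunk's mapping:
//
//	offset := uint64(0x6003F000)
//	chunk := offset >> chunkShift // 1
//	within := offset & chunkMask  // 0x2003F000
//
// forEachMappingSlice below relies on exactly this arithmetic to select the
// chunk mapping and the slice bounds inside it.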

// NewMemoryFile creates a MemoryFile backed by the given file. If
// NewMemoryFile succeeds, ownership of file is transferred to the returned
// MemoryFile.
func NewMemoryFile(file *os.File, opts MemoryFileOpts) (*MemoryFile, error) {
	switch opts.DelayedEviction {
	case DelayedEvictionDefault:
		opts.DelayedEviction = DelayedEvictionEnabled
	case DelayedEvictionDisabled, DelayedEvictionManual:
		opts.UseHostMemcgPressure = false
	case DelayedEvictionEnabled:
		// ok
	default:
		return nil, fmt.Errorf("invalid MemoryFileOpts.DelayedEviction: %v", opts.DelayedEviction)
	}

	// Truncate the file to 0 bytes first to ensure that it's empty.
	if err := file.Truncate(0); err != nil {
		return nil, err
	}
	f := &MemoryFile{
		opts:      opts,
		file:      file,
		evictable: make(map[EvictableMemoryUser]*evictableMemoryUserInfo),
	}
	f.mappings.Store(make([]uintptr, 0))
	f.reclaimCond.L = &f.mu

	if f.opts.DelayedEviction == DelayedEvictionEnabled && f.opts.UseHostMemcgPressure {
		stop, err := hostmm.NotifyCurrentMemcgPressureCallback(func() {
			f.mu.Lock()
			startedAny := f.startEvictionsLocked()
			f.mu.Unlock()
			if startedAny {
				log.Debugf("pgalloc.MemoryFile performing evictions due to memcg pressure")
			}
		}, "low")
		if err != nil {
			return nil, fmt.Errorf("failed to configure memcg pressure level notifications: %v", err)
		}
		f.stopNotifyPressure = stop
	}

	go f.runReclaim() // S/R-SAFE: f.mu

	if !opts.DisableIMAWorkAround {
		IMAWorkAroundForMemFile(file.Fd())
	}
	return f, nil
}

// IMAWorkAroundForMemFile works around IMA by immediately creating a temporary
// PROT_EXEC mapping, while the backing file is still small. IMA will ignore
// any future mappings.
//
// The Linux kernel contains an optional feature called "Integrity
// Measurement Architecture" (IMA). If IMA is enabled, it will checksum
// binaries the first time they are mapped PROT_EXEC. This is bad news for
// executable pages mapped from our backing file, which can grow to
// terabytes in (sparse) size. If IMA attempts to checksum a file that
// large, it will allocate all of the sparse pages and quickly exhaust all
// memory.
func IMAWorkAroundForMemFile(fd uintptr) {
	m, _, errno := unix.Syscall6(
		unix.SYS_MMAP,
		0,
		hostarch.PageSize,
		unix.PROT_EXEC,
		unix.MAP_SHARED,
		fd,
		0)
	if errno != 0 {
		// This isn't fatal (IMA may not even be in use). Log the error, but
		// don't return it.
		log.Warningf("Failed to pre-map MemoryFile PROT_EXEC: %v", errno)
	} else {
		if _, _, errno := unix.Syscall(
			unix.SYS_MUNMAP,
			m,
			hostarch.PageSize,
			0); errno != 0 {
			panic(fmt.Sprintf("failed to unmap PROT_EXEC MemoryFile mapping: %v", errno))
		}
	}
}

// Destroy releases all resources used by f.
//
// Preconditions: All pages allocated by f have been freed.
//
// Postconditions: None of f's methods may be called after Destroy.
func (f *MemoryFile) Destroy() {
	f.mu.Lock()
	defer f.mu.Unlock()
	f.destroyed = true
	f.reclaimCond.Signal()
}
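
// Illustrative sketch (not part of the original source): one typical way for
// a caller to construct a MemoryFile from an anonymous host file. The memfd
// name and the empty options are arbitrary examples, not values required by
// this package:
//
//	fd, err := unix.MemfdCreate("memory-file", 0)
//	if err != nil {
//		return err
//	}
//	mf, err := pgalloc.NewMemoryFile(os.NewFile(uintptr(fd), "memory-file"), pgalloc.MemoryFileOpts{})
//	if err != nil {
//		return err
//	}
//	defer mf.Destroy()
//
// NewMemoryFile takes ownership of the file and truncates it to zero bytes,
// so the file must not be shared with any other user.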

// AllocationMode provides a way to inform the pgalloc API how to allocate
// memory and pages on the host.
// A page will exist in one of the following incremental states:
//  1. Allocated: A page is allocated if it was returned by Allocate() and its
//     reference count hasn't dropped to 0 since then.
//  2. Committed: As described in MemoryFile documentation above, a page is
//     committed if the host kernel is spending resources to store its
//     contents. A committed page is implicitly allocated.
//  3. Populated: A page is populated for reading/writing in a page table
//     hierarchy if it has a page table entry that permits reading/writing
//     respectively. A populated page is implicitly committed, since the page
//     table entry needs a physical page to point to, but not vice versa.
type AllocationMode int

const (
	// AllocateOnly indicates that pages need to only be allocated.
	AllocateOnly AllocationMode = iota
	// AllocateAndCommit indicates that pages need to be committed, in addition
	// to being allocated.
	AllocateAndCommit
	// AllocateAndWritePopulate indicates that writable pages should ideally be
	// populated in the page table, in addition to being allocated. This is a
	// suggestion, not a requirement.
	AllocateAndWritePopulate
)

// AllocOpts are options used in MemoryFile.Allocate.
type AllocOpts struct {
	// Kind is the memory kind to be used for accounting.
	Kind usage.MemoryKind
	// Dir indicates the direction in which offsets are allocated.
	Dir Direction
	// MemCgID is the memory cgroup ID; the zero value indicates that
	// the memory will not be accounted to any cgroup.
	MemCgID uint32
	// Mode allows callers to select how the pages are allocated in the
	// MemoryFile. Callers that will fill the allocated memory by writing to it
	// should pass AllocateAndWritePopulate to avoid faulting page-by-page. Callers
	// that will fill the allocated memory by invoking host system calls should
	// pass AllocateOnly.
	Mode AllocationMode
	// If Reader is provided, the allocated memory is filled by calling
	// ReadToBlocks() repeatedly until either length bytes are read or a non-nil
	// error is returned. In that case Allocate returns the allocated memory,
	// truncated down to the nearest page. If this is shorter than length bytes
	// due to an error returned by ReadToBlocks(), Allocate returns the partially
	// filled fr and the error.
	Reader safemem.Reader
}
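
// Illustrative sketch (not part of the original source): allocating and
// filling a page-aligned region. The kind and length are arbitrary examples;
// Allocate itself is defined below:
//
//	fr, err := mf.Allocate(4*hostarch.PageSize, pgalloc.AllocOpts{
//		Kind: usage.Anonymous,
//		Mode: pgalloc.AllocateAndWritePopulate,
//	})
//	if err != nil {
//		return err
//	}
//	// ... write to the range via MapInternal(fr, hostarch.Write) ...
//	mf.DecRef(fr) // dropping the last reference makes fr reclaimable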

// Allocate returns a range of initially-zeroed pages of the given length with
// the given accounting kind and a single reference held by the caller. When
// the last reference on an allocated page is released, ownership of the page
// is returned to the MemoryFile, allowing it to be returned by a future call
// to Allocate.
//
// Preconditions: length must be page-aligned and non-zero.
func (f *MemoryFile) Allocate(length uint64, opts AllocOpts) (memmap.FileRange, error) {
	fr, err := f.allocate(length, &opts)
	if err != nil {
		return memmap.FileRange{}, err
	}
	var dsts safemem.BlockSeq
	switch opts.Mode {
	case AllocateOnly: // Allocation is handled above. Nothing more to do.
	case AllocateAndCommit:
		if err := f.commitFile(fr); err != nil {
			f.DecRef(fr)
			return memmap.FileRange{}, err
		}
	case AllocateAndWritePopulate:
		dsts, err = f.MapInternal(fr, hostarch.Write)
		if err != nil {
			f.DecRef(fr)
			return memmap.FileRange{}, err
		}
		if canPopulate() {
			rem := dsts
			for {
				if !tryPopulate(rem.Head()) {
					break
				}
				rem = rem.Tail()
				if rem.IsEmpty() {
					break
				}
			}
		}
	default:
		panic(fmt.Sprintf("unknown allocation mode: %d", opts.Mode))
	}
	if opts.Reader != nil {
		if dsts.IsEmpty() {
			dsts, err = f.MapInternal(fr, hostarch.Write)
			if err != nil {
				f.DecRef(fr)
				return memmap.FileRange{}, err
			}
		}
		n, err := safemem.ReadFullToBlocks(opts.Reader, dsts)
		un := uint64(hostarch.Addr(n).RoundDown())
		if un < length {
			// Free unused memory and update fr to contain only the memory that is
			// still allocated.
			f.DecRef(memmap.FileRange{fr.Start + un, fr.End})
			fr.End = fr.Start + un
		}
		if err != nil {
			return fr, err
		}
	}
	return fr, nil
}

func (f *MemoryFile) allocate(length uint64, opts *AllocOpts) (memmap.FileRange, error) {
	if length == 0 || length%hostarch.PageSize != 0 {
		panic(fmt.Sprintf("invalid allocation length: %#x", length))
	}

	f.mu.Lock()
	defer f.mu.Unlock()

	// Align hugepage-and-larger allocations on hugepage boundaries to try
	// to take advantage of hugetmpfs.
	alignment := uint64(hostarch.PageSize)
	if length >= hostarch.HugePageSize {
		alignment = hostarch.HugePageSize
	}

	// Find a range in the underlying file.
	fr, ok := f.findAvailableRange(length, alignment, opts.Dir)
	if !ok {
		return memmap.FileRange{}, linuxerr.ENOMEM
	}

	// Expand the file if needed.
	if int64(fr.End) > f.fileSize {
		// Round the new file size up to be chunk-aligned.
		newFileSize := (int64(fr.End) + chunkMask) &^ chunkMask
		if err := f.file.Truncate(newFileSize); err != nil {
			return memmap.FileRange{}, err
		}
		f.fileSize = newFileSize
		f.mappingsMu.Lock()
		oldMappings := f.mappings.Load().([]uintptr)
		newMappings := make([]uintptr, newFileSize>>chunkShift)
		copy(newMappings, oldMappings)
		f.mappings.Store(newMappings)
		f.mappingsMu.Unlock()
	}

	if f.opts.ManualZeroing {
		if err := f.manuallyZero(fr); err != nil {
			return memmap.FileRange{}, err
		}
	}
	// Mark selected pages as in use.
	if !f.usage.Add(fr, usageInfo{
		kind:    opts.Kind,
		refs:    1,
		memCgID: opts.MemCgID,
	}) {
		panic(fmt.Sprintf("allocating %v: failed to insert into usage set:\n%v", fr, &f.usage))
	}

	return fr, nil
}

// findAvailableRange returns an available range in the usageSet.
//
// Note that scanning for available slots takes place from the end first,
// working backwards, then forwards. This heuristic has important consequences
// for how sequential mappings can be merged in the host VMAs, given that
// addresses for both application and sentry mappings are allocated top-down
// (from higher to lower addresses). The file is also grown exponentially in
// order to create space for mappings to be allocated downwards.
//
// Precondition: alignment must be a power of 2.
func (f *MemoryFile) findAvailableRange(length, alignment uint64, dir Direction) (memmap.FileRange, bool) {
	if dir == BottomUp {
		return findAvailableRangeBottomUp(&f.usage, length, alignment)
	}
	return findAvailableRangeTopDown(&f.usage, f.fileSize, length, alignment)
}

func findAvailableRangeTopDown(usage *usageSet, fileSize int64, length, alignment uint64) (memmap.FileRange, bool) {
	alignmentMask := alignment - 1

	// Search for space in existing gaps, starting at the current end of the
	// file and working backward.
	lastGap := usage.LastGap()
	gap := lastGap
	for {
		end := gap.End()
		if end > uint64(fileSize) {
			end = uint64(fileSize)
		}

		// Try to allocate from the end of this gap, with the start of the
		// allocated range aligned down to alignment.
		unalignedStart := end - length
		if unalignedStart > end {
			// Negative overflow: this and all preceding gaps are too small to
			// accommodate length.
			break
		}
		if start := unalignedStart &^ alignmentMask; start >= gap.Start() {
			return memmap.FileRange{start, start + length}, true
		}

		gap = gap.PrevLargeEnoughGap(length)
		if !gap.Ok() {
			break
		}
	}

	// Check that it's possible to fit this allocation at the end of a file of any size.
	min := lastGap.Start()
	min = (min + alignmentMask) &^ alignmentMask
	if min+length < min {
		// Overflow: allocation would exceed the range of uint64.
		return memmap.FileRange{}, false
	}

	// Determine the minimum file size required to fit this allocation at its end.
	for {
		newFileSize := 2 * fileSize
		if newFileSize <= fileSize {
			if fileSize != 0 {
				// Overflow: allocation would exceed the range of int64.
				return memmap.FileRange{}, false
			}
			newFileSize = chunkSize
		}
		fileSize = newFileSize

		unalignedStart := uint64(fileSize) - length
		if unalignedStart > uint64(fileSize) {
			// Negative overflow: fileSize is still inadequate.
			continue
		}
		if start := unalignedStart &^ alignmentMask; start >= min {
			return memmap.FileRange{start, start + length}, true
		}
	}
}
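
// Worked example (illustrative, not part of the original source): with an
// empty usage set, fileSize = 0, length = 3 pages, and alignment = 1 page,
// the gap search above finds nothing below the current (zero-length) file,
// so the growth loop sets fileSize to chunkSize (1 GiB) and places the
// allocation at the very end of the grown file:
//
//	start := uint64(chunkSize) - 3*hostarch.PageSize // already page-aligned
//	fr := memmap.FileRange{start, start + 3*hostarch.PageSize}
//
// Growing exponentially and allocating from the top keeps later top-down
// allocations adjacent to earlier ones, which is what lets the corresponding
// host VMAs merge, as described in the findAvailableRange comment above.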

func findAvailableRangeBottomUp(usage *usageSet, length, alignment uint64) (memmap.FileRange, bool) {
	alignmentMask := alignment - 1
	for gap := usage.FirstGap(); gap.Ok(); gap = gap.NextLargeEnoughGap(length) {
		// Align the start address and check if allocation still fits in the gap.
		start := (gap.Start() + alignmentMask) &^ alignmentMask

		// File offsets are int64s. Since length must be strictly positive, end
		// cannot legitimately be 0.
		end := start + length
		if end < start || int64(end) <= 0 {
			return memmap.FileRange{}, false
		}
		if end <= gap.End() {
			return memmap.FileRange{start, end}, true
		}
	}

	// NextLargeEnoughGap should have returned a gap at the end.
	panic(fmt.Sprintf("NextLargeEnoughGap didn't return a gap at the end, length: %d", length))
}

var mlockDisabled atomicbitops.Uint32
var madvPopulateWriteDisabled atomicbitops.Uint32

func canPopulate() bool {
	return mlockDisabled.Load() == 0 || madvPopulateWriteDisabled.Load() == 0
}

func tryPopulateMadv(b safemem.Block) bool {
	if madvPopulateWriteDisabled.Load() != 0 {
		return false
	}
	start, ok := hostarch.Addr(b.Addr()).RoundUp()
	if !ok {
		return true
	}
	end := hostarch.Addr(b.Addr() + uintptr(b.Len())).RoundDown()
	bLen := end - start
	// Only call madvise(MADV_POPULATE_WRITE) if >=2 pages are being populated.
	// 1 syscall overhead >= 1 page fault overhead. This is because syscalls are
	// susceptible to additional overheads like seccomp-bpf filters and auditing.
	if start >= end || bLen <= hostarch.PageSize {
		return true
	}
	_, _, errno := unix.RawSyscall(unix.SYS_MADVISE, uintptr(start), uintptr(bLen), unix.MADV_POPULATE_WRITE)
	if errno != 0 {
		if errno == unix.EINVAL {
			// EINVAL is expected if MADV_POPULATE_WRITE is not supported (Linux <5.14).
			log.Infof("Disabling pgalloc.MemoryFile.AllocateAndFill pre-population: madvise failed: %s", errno)
		} else {
			log.Warningf("Disabling pgalloc.MemoryFile.AllocateAndFill pre-population: madvise failed: %s", errno)
		}
		madvPopulateWriteDisabled.Store(1)
		return false
	}
	return true
}

func tryPopulateMlock(b safemem.Block) bool {
	if mlockDisabled.Load() != 0 {
		return false
	}
	// Call mlock to populate pages, then munlock to cancel the mlock (but keep
	// the pages populated). Only do so for hugepage-aligned address ranges to
	// ensure that splitting the VMA in mlock doesn't split any existing
	// hugepages. This assumes that two host syscalls, plus the MM overhead of
	// mlock + munlock, is faster on average than trapping for
	// HugePageSize/PageSize small page faults.
	start, ok := hostarch.Addr(b.Addr()).HugeRoundUp()
	if !ok {
		return true
	}
	end := hostarch.Addr(b.Addr() + uintptr(b.Len())).HugeRoundDown()
	if start >= end {
		return true
	}
	_, _, errno := unix.Syscall(unix.SYS_MLOCK, uintptr(start), uintptr(end-start), 0)
	unix.RawSyscall(unix.SYS_MUNLOCK, uintptr(start), uintptr(end-start), 0)
	if errno != 0 {
		if errno == unix.ENOMEM || errno == unix.EPERM {
			// These errors are expected from hitting non-zero RLIMIT_MEMLOCK, or
			// hitting zero RLIMIT_MEMLOCK without CAP_IPC_LOCK, respectively.
			log.Infof("Disabling pgalloc.MemoryFile.AllocateAndFill pre-population: mlock failed: %s", errno)
		} else {
			log.Warningf("Disabling pgalloc.MemoryFile.AllocateAndFill pre-population: mlock failed: %s", errno)
		}
		mlockDisabled.Store(1)
		return false
	}
	return true
}

func tryPopulate(b safemem.Block) bool {
	// There are two approaches for populating writable pages:
	//  1. madvise(MADV_POPULATE_WRITE). It has the desired effect: "Populate
	//     (prefault) page tables writable, faulting in all pages in the range
	//     just as if manually writing to each page".
	//  2. Call mlock to populate pages, then munlock to cancel the mlock (but
	//     keep the pages populated).
	//
	// Prefer the madvise(MADV_POPULATE_WRITE) approach because:
	//   - Only requires 1 syscall, as opposed to 2 syscalls with mlock approach.
	//   - It is faster because it doesn't have to modify vmas like mlock does.
	//   - It works for disk-backed memory mappings too. The mlock approach doesn't
	//     work for disk-backed filesystems (e.g. ext4). This is because
	//     mlock(2) => mm/gup.c:__mm_populate() emulates a read fault on writable
	//     MAP_SHARED mappings. For memory-backed (shmem) files,
	//     mm/mmap.c:vma_set_page_prot() => vma_wants_writenotify() is false, so
	//     the page table entries populated by a read fault are writable. For
	//     disk-backed files, vma_set_page_prot() => vma_wants_writenotify() is
	//     true, so the page table entries populated by a read fault are read-only.
	if tryPopulateMadv(b) {
		return true
	}
	return tryPopulateMlock(b)
}

// fallocate(2) modes, defined in Linux's include/uapi/linux/falloc.h.
const (
	_FALLOC_FL_KEEP_SIZE  = 1
	_FALLOC_FL_PUNCH_HOLE = 2
)

// Decommit releases resources associated with maintaining the contents of the
// given pages. If Decommit succeeds, future accesses of the decommitted pages
// will read zeroes.
//
// Preconditions: fr.Length() > 0.
func (f *MemoryFile) Decommit(fr memmap.FileRange) error {
	if !fr.WellFormed() || fr.Length() == 0 || fr.Start%hostarch.PageSize != 0 || fr.End%hostarch.PageSize != 0 {
		panic(fmt.Sprintf("invalid range: %v", fr))
	}

	if f.opts.ManualZeroing {
		// FALLOC_FL_PUNCH_HOLE may not zero pages if ManualZeroing is in
		// effect.
		if err := f.manuallyZero(fr); err != nil {
			return err
		}
	} else {
		if err := f.decommitFile(fr); err != nil {
			return err
		}
	}

	f.markDecommitted(fr)
	return nil
}

func (f *MemoryFile) manuallyZero(fr memmap.FileRange) error {
	return f.forEachMappingSlice(fr, func(bs []byte) {
		for i := range bs {
			bs[i] = 0
		}
	})
}

func (f *MemoryFile) commitFile(fr memmap.FileRange) error {
	// "The default operation (i.e., mode is zero) of fallocate() allocates the
	// disk space within the range specified by offset and len." - fallocate(2)
	return unix.Fallocate(
		int(f.file.Fd()),
		0, // mode
		int64(fr.Start),
		int64(fr.Length()))
}

func (f *MemoryFile) decommitFile(fr memmap.FileRange) error {
	// "After a successful call, subsequent reads from this range will
	// return zeroes. The FALLOC_FL_PUNCH_HOLE flag must be ORed with
	// FALLOC_FL_KEEP_SIZE in mode ..." - fallocate(2)
	return unix.Fallocate(
		int(f.file.Fd()),
		_FALLOC_FL_PUNCH_HOLE|_FALLOC_FL_KEEP_SIZE,
		int64(fr.Start),
		int64(fr.Length()))
}

func (f *MemoryFile) markDecommitted(fr memmap.FileRange) {
	f.mu.Lock()
	defer f.mu.Unlock()
	// Since we're changing the knownCommitted attribute, we need to merge
	// across the entire range to ensure that the usage tree is minimal.
	gap := f.usage.ApplyContiguous(fr, func(seg usageIterator) {
		val := seg.ValuePtr()
		if val.knownCommitted {
			// Drop the usageExpected appropriately.
			amount := seg.Range().Length()
			usage.MemoryAccounting.Dec(amount, val.kind, val.memCgID)
			f.usageExpected -= amount
			val.knownCommitted = false
		}
		val.memCgID = 0
	})
	if gap.Ok() {
		panic(fmt.Sprintf("Decommit(%v): attempted to decommit unallocated pages %v:\n%v", fr, gap.Range(), &f.usage))
	}
	f.usage.MergeRange(fr)
}
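
// Illustrative sketch (not part of the original source): decommitting a
// previously allocated, page-aligned range via the decommit path above. After
// a successful Decommit, reads of the range observe zeroes again and the
// pages are no longer accounted as committed:
//
//	fr := memmap.FileRange{Start: start, End: start + hostarch.PageSize} // hypothetical allocated range
//	if err := mf.Decommit(fr); err != nil {
//		return err
//	}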

// IncRef implements memmap.File.IncRef.
func (f *MemoryFile) IncRef(fr memmap.FileRange, memCgID uint32) {
	if !fr.WellFormed() || fr.Length() == 0 || fr.Start%hostarch.PageSize != 0 || fr.End%hostarch.PageSize != 0 {
		panic(fmt.Sprintf("invalid range: %v", fr))
	}

	f.mu.Lock()
	defer f.mu.Unlock()

	gap := f.usage.ApplyContiguous(fr, func(seg usageIterator) {
		seg.ValuePtr().refs++
	})
	if gap.Ok() {
		panic(fmt.Sprintf("IncRef(%v): attempted to IncRef on unallocated pages %v:\n%v", fr, gap.Range(), &f.usage))
	}

	f.usage.MergeAdjacent(fr)
}

// DecRef implements memmap.File.DecRef.
func (f *MemoryFile) DecRef(fr memmap.FileRange) {
	if !fr.WellFormed() || fr.Length() == 0 || fr.Start%hostarch.PageSize != 0 || fr.End%hostarch.PageSize != 0 {
		panic(fmt.Sprintf("invalid range: %v", fr))
	}

	var freed bool

	f.mu.Lock()
	defer f.mu.Unlock()

	for seg := f.usage.FindSegment(fr.Start); seg.Ok() && seg.Start() < fr.End; seg = seg.NextSegment() {
		seg = f.usage.Isolate(seg, fr)
		val := seg.ValuePtr()
		if val.refs == 0 {
			panic(fmt.Sprintf("DecRef(%v): 0 existing references on %v:\n%v", fr, seg.Range(), &f.usage))
		}
		val.refs--
		if val.refs == 0 {
			f.reclaim.Add(seg.Range(), reclaimSetValue{})
			freed = true
			// Reclassify memory as System, until it's freed by the reclaim
			// goroutine.
			if val.knownCommitted {
				usage.MemoryAccounting.Move(seg.Range().Length(), usage.System, val.kind, val.memCgID)
			}
			val.kind = usage.System
		}
	}
	f.usage.MergeAdjacent(fr)

	if freed {
		f.reclaimable = true
		f.reclaimCond.Signal()
	}
}

// MapInternal implements memmap.File.MapInternal.
func (f *MemoryFile) MapInternal(fr memmap.FileRange, at hostarch.AccessType) (safemem.BlockSeq, error) {
	if !fr.WellFormed() || fr.Length() == 0 {
		panic(fmt.Sprintf("invalid range: %v", fr))
	}
	if at.Execute {
		return safemem.BlockSeq{}, linuxerr.EACCES
	}

	chunks := ((fr.End + chunkMask) >> chunkShift) - (fr.Start >> chunkShift)
	if chunks == 1 {
		// Avoid an unnecessary slice allocation.
		var seq safemem.BlockSeq
		err := f.forEachMappingSlice(fr, func(bs []byte) {
			seq = safemem.BlockSeqOf(safemem.BlockFromSafeSlice(bs))
		})
		return seq, err
	}
	blocks := make([]safemem.Block, 0, chunks)
	err := f.forEachMappingSlice(fr, func(bs []byte) {
		blocks = append(blocks, safemem.BlockFromSafeSlice(bs))
	})
	return safemem.BlockSeqFromSlice(blocks), err
}

// forEachMappingSlice invokes fn on a sequence of byte slices that
// collectively map all bytes in fr.
func (f *MemoryFile) forEachMappingSlice(fr memmap.FileRange, fn func([]byte)) error {
	mappings := f.mappings.Load().([]uintptr)
	for chunkStart := fr.Start &^ chunkMask; chunkStart < fr.End; chunkStart += chunkSize {
		chunk := int(chunkStart >> chunkShift)
		m := atomic.LoadUintptr(&mappings[chunk])
		if m == 0 {
			var err error
			mappings, m, err = f.getChunkMapping(chunk)
			if err != nil {
				return err
			}
		}
		startOff := uint64(0)
		if chunkStart < fr.Start {
			startOff = fr.Start - chunkStart
		}
		endOff := uint64(chunkSize)
		if chunkStart+chunkSize > fr.End {
			endOff = fr.End - chunkStart
		}
		fn(unsafeSlice(m, chunkSize)[startOff:endOff])
	}
	return nil
}

func (f *MemoryFile) getChunkMapping(chunk int) ([]uintptr, uintptr, error) {
	f.mappingsMu.Lock()
	defer f.mappingsMu.Unlock()
	// Another thread may have replaced f.mappings altogether due to file
	// expansion.
	mappings := f.mappings.Load().([]uintptr)
	// Another thread may have already mapped the chunk.
	if m := mappings[chunk]; m != 0 {
		return mappings, m, nil
	}
	m, _, errno := unix.Syscall6(
		unix.SYS_MMAP,
		0,
		chunkSize,
		unix.PROT_READ|unix.PROT_WRITE,
		unix.MAP_SHARED,
		f.file.Fd(),
		uintptr(chunk<<chunkShift))
	if errno != 0 {
		return nil, 0, errno
	}
	atomic.StoreUintptr(&mappings[chunk], m)
	return mappings, m, nil
}

// MarkEvictable allows f to request memory deallocation by calling
// user.Evict(er) in the future.
//
// Redundantly marking an already-evictable range as evictable has no effect.
func (f *MemoryFile) MarkEvictable(user EvictableMemoryUser, er EvictableRange) {
	f.mu.Lock()
	defer f.mu.Unlock()
	info, ok := f.evictable[user]
	if !ok {
		info = &evictableMemoryUserInfo{}
		f.evictable[user] = info
	}
	gap := info.ranges.LowerBoundGap(er.Start)
	for gap.Ok() && gap.Start() < er.End {
		gapER := gap.Range().Intersect(er)
		if gapER.Length() == 0 {
			gap = gap.NextGap()
			continue
		}
		gap = info.ranges.Insert(gap, gapER, evictableRangeSetValue{}).NextGap()
	}
	if !info.evicting {
		switch f.opts.DelayedEviction {
		case DelayedEvictionDisabled:
			// Kick off eviction immediately.
			f.startEvictionGoroutineLocked(user, info)
		case DelayedEvictionEnabled:
			if !f.opts.UseHostMemcgPressure {
				// Ensure that the reclaimer goroutine is running, so that it
				// can start eviction when necessary.
				f.reclaimCond.Signal()
			}
		}
	}
}
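
// Illustrative sketch (not part of the original source): a minimal
// EvictableMemoryUser and how it registers with a MemoryFile. The cache type
// and its fields are hypothetical:
//
//	type exampleCache struct {
//		mu sync.Mutex
//		mf *MemoryFile
//		// ... evictable data owned by this cache ...
//	}
//
//	// Evict implements EvictableMemoryUser.Evict. Per the interface
//	// documentation, it must tolerate ranges that raced with MarkUnevictable
//	// and are no longer evictable.
//	func (c *exampleCache) Evict(ctx context.Context, er EvictableRange) {
//		c.mu.Lock()
//		defer c.mu.Unlock()
//		// Drop cached data in er, then release the backing pages, e.g. by
//		// calling c.mf.DecRef on the corresponding FileRange.
//	}
//
//	// Registration: cache offsets [0, 64 pages) become eligible for eviction.
//	mf.MarkEvictable(c, EvictableRange{0, 64 * hostarch.PageSize})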

// MarkUnevictable informs f that user no longer considers er to be evictable,
// so the MemoryFile should no longer call user.Evict(er). Note that, per
// EvictableMemoryUser.Evict's documentation, user.Evict(er) may still be
// called even after MarkUnevictable returns due to race conditions, and
// implementations of EvictableMemoryUser must handle this possibility.
//
// Redundantly marking an already-unevictable range as unevictable has no
// effect.
func (f *MemoryFile) MarkUnevictable(user EvictableMemoryUser, er EvictableRange) {
	f.mu.Lock()
	defer f.mu.Unlock()
	info, ok := f.evictable[user]
	if !ok {
		return
	}
	seg := info.ranges.LowerBoundSegment(er.Start)
	for seg.Ok() && seg.Start() < er.End {
		seg = info.ranges.Isolate(seg, er)
		seg = info.ranges.Remove(seg).NextSegment()
	}
	// We can only remove info if there's no eviction goroutine running on its
	// behalf.
	if !info.evicting && info.ranges.IsEmpty() {
		delete(f.evictable, user)
	}
}

// MarkAllUnevictable informs f that user no longer considers any offsets to be
// evictable. It otherwise has the same semantics as MarkUnevictable.
func (f *MemoryFile) MarkAllUnevictable(user EvictableMemoryUser) {
	f.mu.Lock()
	defer f.mu.Unlock()
	info, ok := f.evictable[user]
	if !ok {
		return
	}
	info.ranges.RemoveAll()
	// We can only remove info if there's no eviction goroutine running on its
	// behalf.
	if !info.evicting {
		delete(f.evictable, user)
	}
}

// ShouldCacheEvictable returns true if f is meaningfully delaying evictions of
// evictable memory, such that it may be advantageous to cache data in
// evictable memory. The value returned by ShouldCacheEvictable may change
// between calls.
func (f *MemoryFile) ShouldCacheEvictable() bool {
	return f.opts.DelayedEviction == DelayedEvictionManual || f.opts.UseHostMemcgPressure
}

// UpdateUsage ensures that the memory usage statistics in
// usage.MemoryAccounting are up to date.
func (f *MemoryFile) UpdateUsage() error {
	f.mu.Lock()
	defer f.mu.Unlock()

	// If the underlying usage matches what the usage tree already represents,
	// then we can just avoid the entire scan (we know it's accurate).
	currentUsage, err := f.TotalUsage()
	if err != nil {
		return err
	}
	if currentUsage == f.usageExpected && f.usageSwapped == 0 {
		log.Debugf("UpdateUsage: skipped with usageSwapped=0.")
		return nil
	}
	// If the current usage matches the expected but there's swap
	// accounting, then ensure a scan takes place at least every second
	// (when requested).
	if currentUsage == f.usageExpected+f.usageSwapped && time.Now().Before(f.usageLast.Add(time.Second)) {
		log.Debugf("UpdateUsage: skipped with usageSwapped!=0.")
		return nil
	}
	// Linux updates usage values at CONFIG_HZ.
	if scanningAfter := time.Now().Sub(f.usageLast).Milliseconds(); scanningAfter < time.Second.Milliseconds()/linux.CLOCKS_PER_SEC {
		log.Debugf("UpdateUsage: skipped because previous scan happened %d ms back", scanningAfter)
		return nil
	}

	f.usageLast = time.Now()
	err = f.updateUsageLocked(currentUsage, mincore)
	log.Debugf("UpdateUsage: currentUsage=%d, usageExpected=%d, usageSwapped=%d.",
		currentUsage, f.usageExpected, f.usageSwapped)
	log.Debugf("UpdateUsage: took %v.", time.Since(f.usageLast))
	return err
}
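
// Illustrative sketch (not part of the original source): the checkCommitted
// function passed above (mincore, defined elsewhere in this package) can be
// thought of as a thin wrapper around mincore(2), roughly:
//
//	func checkCommittedSketch(bs []byte, committed []byte) error {
//		// One result byte per page; bit 0 is set if the page is resident.
//		return unix.Mincore(bs, committed)
//	}
//
// updateUsageLocked below only inspects bit 0 of each result byte, so any
// implementation with that contract works.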

// updateUsageLocked attempts to detect commitment of previously-uncommitted
// pages by invoking checkCommitted, which is a function that, for each page i
// in bs, sets committed[i] to 1 if the page is committed and 0 otherwise.
//
// Precondition: f.mu must be held; it may be unlocked and reacquired.
// +checklocks:f.mu
func (f *MemoryFile) updateUsageLocked(currentUsage uint64, checkCommitted func(bs []byte, committed []byte) error) error {
	// Track if anything changed to elide the merge. In the common case, we
	// expect all segments to be committed and no merge to occur.
	changedAny := false
	defer func() {
		if changedAny {
			f.usage.MergeAll()
		}

		// Adjust the swap usage to reflect reality.
		if f.usageExpected < currentUsage {
			// Since no pages may be marked decommitted while we hold mu, we
			// know that usage may have only increased since we got the last
			// current usage. Therefore, if usageExpected is still short of
			// currentUsage, we must assume that the difference is in pages
			// that have been swapped.
			newUsageSwapped := currentUsage - f.usageExpected
			if f.usageSwapped < newUsageSwapped {
				usage.MemoryAccounting.Inc(newUsageSwapped-f.usageSwapped, usage.System, 0)
			} else {
				usage.MemoryAccounting.Dec(f.usageSwapped-newUsageSwapped, usage.System, 0)
			}
			f.usageSwapped = newUsageSwapped
		} else if f.usageSwapped != 0 {
			// We have more usage accounted for than the file itself.
			// That's fine, we probably caught a race where pages were
			// being committed while the below loop was running. Just
			// report the higher number that we found and ignore swap.
			usage.MemoryAccounting.Dec(f.usageSwapped, usage.System, 0)
			f.usageSwapped = 0
		}
	}()

	// Reused mincore buffer, will generally be <= 4096 bytes.
	var buf []byte

	// Iterate over all usage data. There will only be usage segments
	// present when there is an associated reference.
	for seg := f.usage.FirstSegment(); seg.Ok(); {
		if !seg.ValuePtr().canCommit() {
			seg = seg.NextSegment()
			continue
		}

		// Get the range for this segment. As we touch slices, the
		// Start value will be walked along.
		r := seg.Range()

		var checkErr error
		err := f.forEachMappingSlice(r,
			func(s []byte) {
				if checkErr != nil {
					return
				}

				// Ensure that we have sufficient buffer for the call
				// (one byte per page). The length of each slice must
				// be page-aligned.
				bufLen := len(s) / hostarch.PageSize
				if len(buf) < bufLen {
					buf = make([]byte, bufLen)
				}

				// Query for new pages in core.
				// NOTE(b/165896008): mincore (which UpdateUsage passes as
				// checkCommitted) might take a really long time. So unlock f.mu
				// while checkCommitted runs.
				f.mu.Unlock() // +checklocksforce
				err := checkCommitted(s, buf)
				f.mu.Lock()
				if err != nil {
					checkErr = err
					return
				}

				// Scan each page and switch out segments.
				seg := f.usage.LowerBoundSegment(r.Start)
				for i := 0; i < bufLen; {
					if buf[i]&0x1 == 0 {
						i++
						continue
					}
					// Scan to the end of this committed range.
					j := i + 1
					for ; j < bufLen; j++ {
						if buf[j]&0x1 == 0 {
							break
						}
					}
					committedFR := memmap.FileRange{
						Start: r.Start + uint64(i*hostarch.PageSize),
						End:   r.Start + uint64(j*hostarch.PageSize),
					}
					// Advance seg to committedFR.Start.
					for seg.Ok() && seg.End() < committedFR.Start {
						seg = seg.NextSegment()
					}
					// Mark pages overlapping committedFR as committed.
					for seg.Ok() && seg.Start() < committedFR.End {
						if seg.ValuePtr().canCommit() {
							seg = f.usage.Isolate(seg, committedFR)
							seg.ValuePtr().knownCommitted = true
							amount := seg.Range().Length()
							usage.MemoryAccounting.Inc(amount, seg.ValuePtr().kind, seg.ValuePtr().memCgID)
							f.usageExpected += amount
							changedAny = true
						}
						seg = seg.NextSegment()
					}
					// Continue scanning for committed pages.
					i = j + 1
				}

				// Advance r.Start.
				r.Start += uint64(len(s))
			})
		if checkErr != nil {
			return checkErr
		}
		if err != nil {
			return err
		}

		// Continue with the first segment after r.End.
		seg = f.usage.LowerBoundSegment(r.End)
	}

	return nil
}

// TotalUsage returns an aggregate usage for all memory statistics except
// Mapped (which is external to MemoryFile). This is generally much cheaper
// than UpdateUsage, but will not provide a fine-grained breakdown.
func (f *MemoryFile) TotalUsage() (uint64, error) {
	// Stat the underlying file to discover the underlying usage. stat(2)
	// always reports the allocated block count in units of 512 bytes. This
	// includes pages in the page cache and swapped pages.
	var stat unix.Stat_t
	if err := unix.Fstat(int(f.file.Fd()), &stat); err != nil {
		return 0, err
	}
	return uint64(stat.Blocks * 512), nil
}

// TotalSize returns the current size of the backing file in bytes, which is an
// upper bound on the amount of memory that can currently be allocated from the
// MemoryFile. The value returned by TotalSize is permitted to change.
func (f *MemoryFile) TotalSize() uint64 {
	f.mu.Lock()
	defer f.mu.Unlock()
	return uint64(f.fileSize)
}

// File returns the backing file.
func (f *MemoryFile) File() *os.File {
	return f.file
}

// FD implements memmap.File.FD.
func (f *MemoryFile) FD() int {
	return int(f.file.Fd())
}

// IsDiskBacked returns true if f is backed by a file on disk.
func (f *MemoryFile) IsDiskBacked() bool {
	return f.opts.DiskBackedFile
}

// String implements fmt.Stringer.String.
//
// Note that because f.String locks f.mu, calling f.String internally
// (including indirectly through the fmt package) risks recursive locking.
// Within the pgalloc package, use f.usage directly instead.
func (f *MemoryFile) String() string {
	f.mu.Lock()
	defer f.mu.Unlock()
	return f.usage.String()
}

// runReclaim implements the reclaimer goroutine, which continuously decommits
// reclaimable pages in order to reduce memory usage and make them available
// for allocation.
func (f *MemoryFile) runReclaim() {
	for {
		// N.B. We must call f.markReclaimed on the returned FileRange.
		fr, ok := f.findReclaimable()
		if !ok {
			break
		}

		if f.opts.ManualZeroing {
			// If ManualZeroing is in effect, only hugepage-aligned regions may
			// be safely passed to decommitFile. Pages will be zeroed on
			// reallocation, so we don't need to perform any manual zeroing
			// here, whether or not decommitFile succeeds.
			if startAddr, ok := hostarch.Addr(fr.Start).HugeRoundUp(); ok {
				if endAddr := hostarch.Addr(fr.End).HugeRoundDown(); startAddr < endAddr {
					decommitFR := memmap.FileRange{uint64(startAddr), uint64(endAddr)}
					if err := f.decommitFile(decommitFR); err != nil {
						log.Warningf("Reclaim failed to decommit %v: %v", decommitFR, err)
					}
				}
			}
		} else {
			if err := f.decommitFile(fr); err != nil {
				log.Warningf("Reclaim failed to decommit %v: %v", fr, err)
				// Zero the pages manually. This won't reduce memory usage, but at
				// least ensures that the pages will be zero when reallocated.
				if err := f.manuallyZero(fr); err != nil {
					panic(fmt.Sprintf("Reclaim failed to decommit or zero %v: %v", fr, err))
				}
			}
		}
		f.markDecommitted(fr)
		f.markReclaimed(fr)
	}

	// We only get here if findReclaimable finds f.destroyed set and returns
	// false.
	f.mu.Lock()
	if !f.destroyed {
		f.mu.Unlock()
		panic("findReclaimable broke out of reclaim loop, but destroyed is no longer set")
	}
	if f.opts.DecommitOnDestroy && f.fileSize > 0 {
		if err := f.decommitFile(memmap.FileRange{Start: 0, End: uint64(f.fileSize)}); err != nil {
			f.mu.Unlock()
			panic(fmt.Sprintf("failed to decommit entire memory file during destruction: %v", err))
		}
	}
	f.file.Close()
	// Ensure that any attempts to use f.file.Fd() fail instead of getting a fd
	// that has possibly been reassigned.
	f.file = nil
	f.mappingsMu.Lock()
	defer f.mappingsMu.Unlock()
	mappings := f.mappings.Load().([]uintptr)
	for i, m := range mappings {
		if m != 0 {
			_, _, errno := unix.Syscall(unix.SYS_MUNMAP, m, chunkSize, 0)
			if errno != 0 {
				log.Warningf("Failed to unmap mapping %#x for MemoryFile chunk %d: %v", m, i, errno)
			}
		}
	}
	// Similarly, invalidate f.mappings. (atomic.Value.Store(nil) panics.)
	f.mappings.Store([]uintptr{})
	f.mu.Unlock()

	// This must be called without holding f.mu to avoid circular lock
	// ordering.
	if f.stopNotifyPressure != nil {
		f.stopNotifyPressure()
	}
}

// findReclaimable finds memory that has been marked for reclaim.
//
// Note that the returned range will be removed from tracking. It
// must be reclaimed (removed from f.usage) at this point.
func (f *MemoryFile) findReclaimable() (memmap.FileRange, bool) {
	f.mu.Lock()
	defer f.mu.Unlock()
	for {
		for {
			if f.destroyed {
				return memmap.FileRange{}, false
			}
			if f.reclaimable {
				break
			}
			if f.opts.DelayedEviction == DelayedEvictionEnabled && !f.opts.UseHostMemcgPressure {
				// No work to do. Evict any pending evictable allocations to
				// get more reclaimable pages before going to sleep.
				f.startEvictionsLocked()
			}
			f.reclaimCond.Wait()
		}
		// Most allocations are done upwards, with exceptions being stacks and some
		// allocators that allocate top-down. Reclaim preserves this order to
		// minimize the cost of the search.
		if seg := f.reclaim.FirstSegment(); seg.Ok() {
			fr := seg.Range()
			f.reclaim.Remove(seg)
			return fr, true
		}
		// Nothing is reclaimable.
		f.reclaimable = false
	}
}

func (f *MemoryFile) markReclaimed(fr memmap.FileRange) {
	f.mu.Lock()
	defer f.mu.Unlock()
	seg := f.usage.FindSegment(fr.Start)
	// All of fr should be mapped to a single uncommitted reclaimable
	// segment accounted to System.
	if !seg.Ok() {
		panic(fmt.Sprintf("reclaimed pages %v include unreferenced pages:\n%v", fr, &f.usage))
	}
	if !seg.Range().IsSupersetOf(fr) {
		panic(fmt.Sprintf("reclaimed pages %v are not entirely contained in segment %v with state %v:\n%v", fr, seg.Range(), seg.Value(), &f.usage))
	}
	if got, want := seg.Value(), (usageInfo{
		kind:           usage.System,
		knownCommitted: false,
		refs:           0,
		memCgID:        0,
	}); got != want {
		panic(fmt.Sprintf("reclaimed pages %v in segment %v has incorrect state %v, wanted %v:\n%v", fr, seg.Range(), got, want, &f.usage))
	}
	// Deallocate reclaimed pages. Even though all of seg is reclaimable,
	// the caller of markReclaimed may not have decommitted it, so we can
	// only mark fr as reclaimed.
	f.usage.Remove(f.usage.Isolate(seg, fr))
}

// StartEvictions requests that f evict all evictable allocations. It does not
// wait for eviction to complete; for this, see MemoryFile.WaitForEvictions.
func (f *MemoryFile) StartEvictions() {
	f.mu.Lock()
	defer f.mu.Unlock()
	f.startEvictionsLocked()
}

// Preconditions: f.mu must be locked.
func (f *MemoryFile) startEvictionsLocked() bool {
	startedAny := false
	for user, info := range f.evictable {
		// Don't start multiple goroutines to evict the same user's
		// allocations.
		if !info.evicting {
			f.startEvictionGoroutineLocked(user, info)
			startedAny = true
		}
	}
	return startedAny
}

// Preconditions:
//   - info == f.evictable[user].
//   - !info.evicting.
//   - f.mu must be locked.
func (f *MemoryFile) startEvictionGoroutineLocked(user EvictableMemoryUser, info *evictableMemoryUserInfo) {
	info.evicting = true
	f.evictionWG.Add(1)
	go func() { // S/R-SAFE: f.evictionWG
		defer f.evictionWG.Done()
		for {
			f.mu.Lock()
			info, ok := f.evictable[user]
			if !ok {
				// This shouldn't happen: only this goroutine is permitted
				// to delete this entry.
				f.mu.Unlock()
				panic(fmt.Sprintf("evictableMemoryUserInfo for EvictableMemoryUser %v deleted while eviction goroutine running", user))
			}
			if info.ranges.IsEmpty() {
				delete(f.evictable, user)
				f.mu.Unlock()
				return
			}
			// Evict from the end of info.ranges, under the assumption that
			// if ranges in user start being used again (and are
			// consequently marked unevictable), such uses are more likely
			// to start from the beginning of user.
			seg := info.ranges.LastSegment()
			er := seg.Range()
			info.ranges.Remove(seg)
			// user.Evict() must be called without holding f.mu to avoid
			// circular lock ordering.
			f.mu.Unlock()
			user.Evict(context.Background(), er)
		}
	}()
}

// WaitForEvictions blocks until f is no longer evicting any evictable
// allocations.
func (f *MemoryFile) WaitForEvictions() {
	f.evictionWG.Wait()
}

type usageSetFunctions struct{}

func (usageSetFunctions) MinKey() uint64 {
	return 0
}

func (usageSetFunctions) MaxKey() uint64 {
	return math.MaxUint64
}

func (usageSetFunctions) ClearValue(val *usageInfo) {
}

func (usageSetFunctions) Merge(_ memmap.FileRange, val1 usageInfo, _ memmap.FileRange, val2 usageInfo) (usageInfo, bool) {
	return val1, val1 == val2
}

func (usageSetFunctions) Split(_ memmap.FileRange, val usageInfo, _ uint64) (usageInfo, usageInfo) {
	return val, val
}

// evictableRangeSetValue is the value type of evictableRangeSet.
type evictableRangeSetValue struct{}

type evictableRangeSetFunctions struct{}

func (evictableRangeSetFunctions) MinKey() uint64 {
	return 0
}

func (evictableRangeSetFunctions) MaxKey() uint64 {
	return math.MaxUint64
}

func (evictableRangeSetFunctions) ClearValue(val *evictableRangeSetValue) {
}

func (evictableRangeSetFunctions) Merge(_ EvictableRange, _ evictableRangeSetValue, _ EvictableRange, _ evictableRangeSetValue) (evictableRangeSetValue, bool) {
	return evictableRangeSetValue{}, true
}

func (evictableRangeSetFunctions) Split(_ EvictableRange, _ evictableRangeSetValue, _ uint64) (evictableRangeSetValue, evictableRangeSetValue) {
	return evictableRangeSetValue{}, evictableRangeSetValue{}
}

// reclaimSetValue is the value type of reclaimSet.
type reclaimSetValue struct{}

type reclaimSetFunctions struct{}

func (reclaimSetFunctions) MinKey() uint64 {
	return 0
}

func (reclaimSetFunctions) MaxKey() uint64 {
	return math.MaxUint64
}

func (reclaimSetFunctions) ClearValue(val *reclaimSetValue) {
}

func (reclaimSetFunctions) Merge(_ memmap.FileRange, _ reclaimSetValue, _ memmap.FileRange, _ reclaimSetValue) (reclaimSetValue, bool) {
	return reclaimSetValue{}, true
}

func (reclaimSetFunctions) Split(_ memmap.FileRange, _ reclaimSetValue, _ uint64) (reclaimSetValue, reclaimSetValue) {
	return reclaimSetValue{}, reclaimSetValue{}
}
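
// Illustrative note (not part of the original source): usageSetFunctions.Merge
// only merges adjacent segments whose usageInfo values are identical, which is
// what keeps the usage set "maximally merged" as required by the usage field
// documentation above. For example:
//
//	a := usageInfo{kind: usage.Anonymous, knownCommitted: true, refs: 1}
//	b := usageInfo{kind: usage.Anonymous, knownCommitted: true, refs: 1}
//	c := usageInfo{kind: usage.Anonymous, knownCommitted: false, refs: 1}
//
// Adjacent segments holding a and b merge (a == b); a segment holding c does
// not merge with either, since knownCommitted differs.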