// Copyright 2019 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Scavenging free pages.
//
// This file implements scavenging (the release of physical pages backing mapped
// memory) of free and unused pages in the heap as a way to deal with page-level
// fragmentation and reduce the RSS of Go applications.
//
// Scavenging in Go happens on two fronts: there's the background
// (asynchronous) scavenger and the heap-growth (synchronous) scavenger.
//
// The former happens on a goroutine much like the background sweeper which is
// soft-capped at using scavengePercent of the mutator's time, based on
// order-of-magnitude estimates of the costs of scavenging. The background
// scavenger's primary goal is to bring the estimated heap RSS of the
// application down to a goal.
//
// That goal is defined as:
//	(retainExtraPercent+100) / 100 * (heapGoal / lastHeapGoal) * last_heap_inuse
//
// Essentially, we wish to have the application's RSS track the heap goal, but
// the heap goal is defined in terms of bytes of objects, rather than pages like
// RSS. As a result, we need to account for fragmentation internal to
// spans. heapGoal / lastHeapGoal defines the ratio between the current heap goal
// and the last heap goal, which tells us by how much the heap is growing and
// shrinking. We estimate what the heap will grow to in terms of pages by taking
// this ratio and multiplying it by heap_inuse at the end of the last GC, which
// allows us to account for this additional fragmentation. Note that this
// procedure makes the assumption that the degree of fragmentation won't change
// dramatically over the next GC cycle. Overestimating the amount of
// fragmentation simply results in higher memory use, which will be accounted
// for by the next pacing update. Underestimating the fragmentation, however,
// could lead to performance degradation. Handling this case is not within the
// scope of the scavenger. Situations where the amount of fragmentation balloons
// over the course of a single GC cycle should be considered pathologies,
// flagged as bugs, and fixed appropriately.
//
// An additional factor of retainExtraPercent is added as a buffer to help ensure
// that there's more unscavenged memory to allocate out of, since each allocation
// out of scavenged memory incurs a potentially expensive page fault.
//
// The goal is updated after each GC and the scavenger's pacing parameters
// (which live in mheap_) are updated to match. The pacing parameters work much
// like the background sweeping parameters. The parameters define a line whose
// horizontal axis is time and vertical axis is estimated heap RSS, and the
// scavenger attempts to stay below that line at all times.
//
// The synchronous heap-growth scavenging happens whenever the heap grows in
// size, for some definition of heap-growth. The intuition behind this is that
// the application had to grow the heap because existing fragments were
// not sufficiently large to satisfy a page-level memory allocation, so we
// scavenge those fragments eagerly to offset the growth in RSS that results.
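//
// As a purely illustrative example of the goal computation above (the numbers
// are hypothetical, not taken from any real program): with retainExtraPercent
// = 10, a current heapGoal of 512 MiB, a lastHeapGoal of 256 MiB, and a
// last_heap_inuse of 280 MiB, the scavenger's RSS goal would be roughly
//	(10+100)/100 * (512/256) * 280 MiB = 1.10 * 2.0 * 280 MiB = 616 MiB
// i.e. the heap is expected to double, the retained-memory target follows
// heap_inuse (which already includes span-internal fragmentation), and a 10%
// buffer of unscavenged memory is kept for the allocator to draw from.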

package runtime

import (
	"internal/goos"
	"runtime/internal/atomic"
	"runtime/internal/sys"
	"unsafe"
)

const (
	// The background scavenger is paced according to these parameters.
	//
	// scavengePercent represents the portion of mutator time we're willing
	// to spend on scavenging in percent.
	scavengePercent = 1 // 1%

	// retainExtraPercent represents the amount of memory over the heap goal
	// that the scavenger should keep as a buffer space for the allocator.
	//
	// The purpose of maintaining this overhead is to have a greater pool of
	// unscavenged memory available for allocation (since using scavenged memory
	// incurs an additional cost), to account for heap fragmentation and
	// the ever-changing layout of the heap.
	retainExtraPercent = 10

	// maxPagesPerPhysPage is the maximum number of supported runtime pages per
	// physical page, based on maxPhysPageSize.
	maxPagesPerPhysPage = maxPhysPageSize / pageSize

	// scavengeCostRatio is the approximate ratio between the costs of using previously
	// scavenged memory and scavenging memory.
	//
	// For most systems the cost of scavenging greatly outweighs the costs
	// associated with using scavenged memory, making this constant 0. On other systems
	// (especially ones where "sysUsed" is not just a no-op) this cost is non-trivial.
	//
	// This ratio is used as part of a multiplicative factor to help the scavenger account
	// for the additional costs of using scavenged memory in its pacing.
	scavengeCostRatio = 0.7 * (goos.IsDarwin + goos.IsIos)

	// scavengeReservationShards determines the amount of memory the scavenger
	// should reserve for scavenging at a time. Specifically, the amount of
	// memory reserved is (heap size in bytes) / scavengeReservationShards.
	scavengeReservationShards = 64
)

// heapRetained returns an estimate of the current heap RSS.
func heapRetained() uint64 {
	return memstats.heap_sys.load() - atomic.Load64(&memstats.heap_released)
}

// gcPaceScavenger updates the scavenger's pacing, particularly
// its rate and RSS goal. For this, it requires the current heapGoal,
// and the heapGoal for the previous GC cycle.
//
// The RSS goal is based on the current heap goal with a small overhead
// to accommodate non-determinism in the allocator.
//
// The pacing is based on scavengePageRate, which applies to both regular and
// huge pages. See that constant for more information.
//
// Must be called whenever GC pacing is updated.
//
// mheap_.lock must be held or the world must be stopped.
func gcPaceScavenger(heapGoal, lastHeapGoal uint64) {
	assertWorldStoppedOrLockHeld(&mheap_.lock)

	// If we're called before the first GC completed, disable scavenging.
	// We never scavenge before the 2nd GC cycle anyway (we don't have enough
	// information about the heap yet) so this is fine, and avoids a fault
	// or garbage data later.
	if lastHeapGoal == 0 {
		atomic.Store64(&mheap_.scavengeGoal, ^uint64(0))
		return
	}
	// Compute our scavenging goal.
	goalRatio := float64(heapGoal) / float64(lastHeapGoal)
	retainedGoal := uint64(float64(memstats.last_heap_inuse) * goalRatio)
	// Add retainExtraPercent overhead to retainedGoal. This calculation
	// looks strange but the purpose is to arrive at an integer division
	// (e.g. if retainExtraPercent = 12.5, then we get a divisor of 8)
	// that also avoids the overflow from a multiplication.
	retainedGoal += retainedGoal / (1.0 / (retainExtraPercent / 100.0))
	// Align it to a physical page boundary to make the following calculations
	// a bit more exact.
	retainedGoal = (retainedGoal + uint64(physPageSize) - 1) &^ (uint64(physPageSize) - 1)

	// Represents where we are now in the heap's contribution to RSS in bytes.
	//
	// Guaranteed to always be a multiple of physPageSize on systems where
	// physPageSize <= pageSize since we map heap_sys at a rate larger than
	// any physPageSize and release memory in multiples of the physPageSize.
	//
	// However, certain functions recategorize heap_sys as other stats (e.g.
	// stack_sys) and this happens in multiples of pageSize, so on systems
	// where physPageSize > pageSize the calculations below will not be exact.
	// Generally this is OK since we'll be off by at most one regular
	// physical page.
	retainedNow := heapRetained()

	// If we're already below our goal, or within one page of our goal, then disable
	// the background scavenger. We disable the background scavenger if there's
	// less than one physical page of work to do because it's not worth it.
	if retainedNow <= retainedGoal || retainedNow-retainedGoal < uint64(physPageSize) {
		atomic.Store64(&mheap_.scavengeGoal, ^uint64(0))
		return
	}
	atomic.Store64(&mheap_.scavengeGoal, retainedGoal)
}

// Sleep/wait state of the background scavenger.
var scavenge struct {
	lock       mutex
	g          *g
	parked     bool
	timer      *timer
	sysmonWake uint32 // Set atomically.
}

// readyForScavenger signals sysmon to wake the scavenger because
// there may be new work to do.
//
// There may be a significant delay between when this function runs
// and when the scavenger is kicked awake, but it may be safely invoked
// in contexts where wakeScavenger is unsafe to call directly.
func readyForScavenger() {
	atomic.Store(&scavenge.sysmonWake, 1)
}

// wakeScavenger immediately unparks the scavenger if necessary.
//
// May run without a P, but it may allocate, so it must not be called
// on any allocation path.
//
// mheap_.lock, scavenge.lock, and sched.lock must not be held.
func wakeScavenger() {
	lock(&scavenge.lock)
	if scavenge.parked {
		// Notify sysmon that it shouldn't bother waking up the scavenger.
		atomic.Store(&scavenge.sysmonWake, 0)

		// Try to stop the timer but we don't really care if we succeed.
		// It's possible that either a timer was never started, or that
		// we're racing with it.
		// In the case that we're racing with it, there's a low chance that
		// we experience a spurious wake-up of the scavenger, but that's
		// totally safe.
		stopTimer(scavenge.timer)

		// Unpark the goroutine and tell it that there may have been a pacing
		// change. Note that we skip the scheduler's runnext slot because we
		// want to avoid having the scavenger interfere with the fair
		// scheduling of user goroutines. In effect, this schedules the
		// scavenger at a "lower priority" but that's OK because it'll
		// catch up on the work it missed when it does get scheduled.
		scavenge.parked = false

		// Ready the goroutine by injecting it. We use injectglist instead
		// of ready or goready in order to allow us to run this function
		// without a P. injectglist also avoids placing the goroutine in
		// the current P's runnext slot, which is desirable to prevent
		// the scavenger from interfering with user goroutine scheduling
		// too much.
		var list gList
		list.push(scavenge.g)
		injectglist(&list)
	}
	unlock(&scavenge.lock)
}

// scavengeSleep attempts to put the scavenger to sleep for ns.
//
// Note that this function should only be called by the scavenger.
//
// The scavenger may be woken up earlier by a pacing change, and it may not go
// to sleep at all if there's a pending pacing change.
//
// Returns the amount of time actually slept.
func scavengeSleep(ns int64) int64 {
	lock(&scavenge.lock)

	// Set the timer.
	//
	// This must happen here instead of inside gopark
	// because we can't close over any variables without
	// failing escape analysis.
	start := nanotime()
	resetTimer(scavenge.timer, start+ns)

	// Mark ourself as asleep and go to sleep.
	scavenge.parked = true
	goparkunlock(&scavenge.lock, waitReasonSleep, traceEvGoSleep, 2)

	// Return how long we actually slept for.
	return nanotime() - start
}

// Background scavenger.
//
// The background scavenger maintains the RSS of the application below
// the line described by the proportional scavenging statistics in
// the mheap struct.
func bgscavenge(c chan int) {
	scavenge.g = getg()

	lockInit(&scavenge.lock, lockRankScavenge)
	lock(&scavenge.lock)
	scavenge.parked = true

	scavenge.timer = new(timer)
	scavenge.timer.f = func(_ interface{}, _ uintptr) {
		wakeScavenger()
	}

	c <- 1
	goparkunlock(&scavenge.lock, waitReasonGCScavengeWait, traceEvGoBlock, 1)

	// idealFraction is the ideal % of overall application CPU time that we
	// spend scavenging.
	idealFraction := float64(scavengePercent) / 100.0

	// Input: fraction of CPU time used.
	// Setpoint: idealFraction.
	// Output: ratio of critical time to sleep time (determines sleep time).
	//
	// The output of this controller is somewhat indirect to what we actually
	// want to achieve: how much time to sleep for. The reason for this definition
	// is to ensure that the controller's outputs have a direct relationship with
	// its inputs (as opposed to an inverse relationship), making it somewhat
	// easier to reason about for tuning purposes.
	critSleepController := piController{
		// Tuned loosely via Ziegler-Nichols process.
		kp: 0.3375,
		ti: 3.2e6,
		tt: 1e9, // 1 second reset time.

		// These ranges seem wide, but we want to give the controller plenty of
		// room to hunt for the optimal value.
		min: 0.001,  // 1:1000
		max: 1000.0, // 1000:1
	}
	// It doesn't really matter what value we start at, but we can't be zero, because
	// that'll cause divide-by-zero issues.
	critSleepRatio := 0.001
	for {
		released := uintptr(0)
		crit := float64(0)

		// Spend at least 1 ms scavenging, otherwise the corresponding
		// sleep time to maintain our desired utilization is too low to
		// be reliable.
		const minCritTime = 1e6
		for crit < minCritTime {
			// If background scavenging is disabled or if there's no work to do just park.
			retained, goal := heapRetained(), atomic.Load64(&mheap_.scavengeGoal)
			if retained <= goal {
				break
			}

			// scavengeQuantum is the amount of memory we try to scavenge
			// in one go. A smaller value means the scavenger is more responsive
			// to the scheduler in case of e.g. preemption. A larger value means
			// that the overheads of scavenging are better amortized, so better
			// scavenging throughput.
			//
			// The current value is chosen assuming a cost of ~10µs/physical page
			// (this is somewhat pessimistic), which implies a worst-case latency of
			// about 160µs for 4 KiB physical pages. The current value is biased
			// toward latency over throughput.
			const scavengeQuantum = 64 << 10

			// Accumulate the amount of time spent scavenging.
			start := nanotime()
			r := mheap_.pages.scavenge(scavengeQuantum)
			atomic.Xadduintptr(&mheap_.pages.scav.released, r)
			end := nanotime()

			// On some platforms we may see end <= start if the time it takes to scavenge
			// memory is less than the minimum granularity of its clock (e.g. Windows) or
			// due to clock bugs.
			//
			// In this case, just assume scavenging takes 10 µs per regular physical page
			// (determined empirically), and conservatively ignore the impact of huge pages
			// on timing.
			const approxCritNSPerPhysicalPage = 10e3
			if end <= start {
				crit += approxCritNSPerPhysicalPage * float64(r/physPageSize)
			} else {
				crit += float64(end - start)
			}
			released += r

			// When using fake time just do one loop.
			if faketime != 0 {
				break
			}
		}

		if released == 0 {
			lock(&scavenge.lock)
			scavenge.parked = true
			goparkunlock(&scavenge.lock, waitReasonGCScavengeWait, traceEvGoBlock, 1)
			continue
		}

		if released < physPageSize {
			// If this happens, it means that we may have attempted to release part
			// of a physical page, but the likely effect of that is that it released
			// the whole physical page, some of which may have still been in-use.
			// This could lead to memory corruption. Throw.
			throw("released less than one physical page of memory")
		}

		if crit < minCritTime {
			// This means there wasn't enough work to actually fill up minCritTime.
			// That's fine; we shouldn't try to do anything with this information
			// because it's going to result in a short enough sleep request that things
			// will get messy. Just assume we did at least this much work.
			// All this means is that we'll sleep longer than we otherwise would have.
			crit = minCritTime
		}

		// Multiply the critical time by 1 + the ratio of the costs of using
		// scavenged memory vs. scavenging memory. This forces us to pay down
		// the cost of reusing this memory eagerly by sleeping for a longer period
		// of time and scavenging less frequently. More concretely, we avoid situations
		// where we end up scavenging so often that we hurt allocation performance
		// because of the additional overheads of using scavenged memory.
		crit *= 1 + scavengeCostRatio

		// Go to sleep for our current sleepNS.
		slept := scavengeSleep(int64(crit / critSleepRatio))

		// Calculate the CPU time spent.
		//
		// This may be slightly inaccurate with respect to GOMAXPROCS, but we're
		// recomputing this often enough relative to GOMAXPROCS changes in general
		// (it only changes when the world is stopped, and not during a GC) that
		// that small inaccuracy is in the noise.
		cpuFraction := float64(crit) / ((float64(slept) + crit) * float64(gomaxprocs))

		// Update the critSleepRatio, adjusting until we reach our ideal fraction.
		critSleepRatio = critSleepController.next(cpuFraction, idealFraction, float64(slept)+crit)
	}
}
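
// As a rough, purely illustrative example of the pacing arithmetic above
// (hypothetical numbers, not measurements): suppose one iteration spends
// crit = 2ms scavenging (after the scavengeCostRatio multiplier) and
// critSleepRatio is currently 0.01. The scavenger then asks to sleep for
// crit/critSleepRatio = 200ms. With GOMAXPROCS = 4, the resulting CPU
// fraction is 2ms / ((200ms + 2ms) * 4) ≈ 0.25%, which is below the 1%
// idealFraction, so the controller raises critSleepRatio and the next
// sleep is shorter.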

// scavenge scavenges nbytes worth of free pages, starting with the
// highest address first. Successive calls continue from where it left
// off until the heap is exhausted. Call scavengeStartGen to bring it
// back to the top of the heap.
//
// Returns the amount of memory scavenged in bytes.
func (p *pageAlloc) scavenge(nbytes uintptr) uintptr {
	var (
		addrs addrRange
		gen   uint32
	)
	released := uintptr(0)
	for released < nbytes {
		if addrs.size() == 0 {
			if addrs, gen = p.scavengeReserve(); addrs.size() == 0 {
				break
			}
		}
		systemstack(func() {
			r, a := p.scavengeOne(addrs, nbytes-released)
			released += r
			addrs = a
		})
	}
	// Only unreserve the space which hasn't been scavenged or searched
	// to ensure we always make progress.
	p.scavengeUnreserve(addrs, gen)
	return released
}

// printScavTrace prints a scavenge trace line to standard error.
//
// released should be the amount of memory released since the last time this
// was called, and forced indicates whether the scavenge was forced by the
// application.
func printScavTrace(gen uint32, released uintptr, forced bool) {
	printlock()
	print("scav ", gen, " ",
		released>>10, " KiB work, ",
		atomic.Load64(&memstats.heap_released)>>10, " KiB total, ",
		(atomic.Load64(&memstats.heap_inuse)*100)/heapRetained(), "% util",
	)
	if forced {
		print(" (forced)")
	}
	println()
	printunlock()
}

// scavengeStartGen starts a new scavenge generation, resetting
// the scavenger's search space to the full in-use address space.
//
// p.mheapLock must be held.
//
// Must run on the system stack because p.mheapLock must be held.
//
//go:systemstack
func (p *pageAlloc) scavengeStartGen() {
	assertLockHeld(p.mheapLock)

	lock(&p.scav.lock)
	if debug.scavtrace > 0 {
		printScavTrace(p.scav.gen, atomic.Loaduintptr(&p.scav.released), false)
	}
	p.inUse.cloneInto(&p.scav.inUse)

	// Pick the new starting address for the scavenger cycle.
	var startAddr offAddr
	if p.scav.scavLWM.lessThan(p.scav.freeHWM) {
		// The "free" high watermark exceeds the "scavenged" low watermark,
		// so there are free scavengable pages in parts of the address space
		// that the scavenger already searched, the high watermark being the
		// highest one. Pick that as our new starting point to ensure we
		// see those pages.
		startAddr = p.scav.freeHWM
	} else {
		// The "free" high watermark does not exceed the "scavenged" low
		// watermark. This means the allocator didn't free any memory in
		// the range we scavenged last cycle, so we might as well continue
		// scavenging from where we were.
		startAddr = p.scav.scavLWM
	}
	p.scav.inUse.removeGreaterEqual(startAddr.addr())

	// reservationBytes may be zero if p.inUse.totalBytes is small, or if
	// scavengeReservationShards is large. This case is fine as the scavenger
	// will simply be turned off, but it does mean that scavengeReservationShards,
	// in concert with pallocChunkBytes, dictates the minimum heap size at which
	// the scavenger triggers. In practice this minimum is generally less than an
	// arena in size, so virtually every heap has the scavenger on.
	p.scav.reservationBytes = alignUp(p.inUse.totalBytes, pallocChunkBytes) / scavengeReservationShards
	p.scav.gen++
	atomic.Storeuintptr(&p.scav.released, 0)
	p.scav.freeHWM = minOffAddr
	p.scav.scavLWM = maxOffAddr
	unlock(&p.scav.lock)
}

// scavengeReserve reserves a contiguous range of the address space
// for scavenging. The maximum amount of space it reserves is proportional
// to the size of the heap. The ranges are reserved from the high addresses
// first.
//
// Returns the reserved range and the scavenge generation number for it.
func (p *pageAlloc) scavengeReserve() (addrRange, uint32) {
	lock(&p.scav.lock)
	gen := p.scav.gen

	// Start by reserving the minimum.
	r := p.scav.inUse.removeLast(p.scav.reservationBytes)

	// Return early if the size is zero; we don't want to use
	// the bogus address below.
	if r.size() == 0 {
		unlock(&p.scav.lock)
		return r, gen
	}

	// The scavenger requires that base be aligned to a
	// palloc chunk because that's the unit of operation for
	// the scavenger, so align down, potentially extending
	// the range.
	newBase := alignDown(r.base.addr(), pallocChunkBytes)

	// Remove from inUse however much extra we just pulled out.
	p.scav.inUse.removeGreaterEqual(newBase)
	unlock(&p.scav.lock)

	r.base = offAddr{newBase}
	return r, gen
}

// scavengeUnreserve returns an unscavenged portion of a range that was
// previously reserved with scavengeReserve.
func (p *pageAlloc) scavengeUnreserve(r addrRange, gen uint32) {
	if r.size() == 0 {
		return
	}
	if r.base.addr()%pallocChunkBytes != 0 {
		throw("unreserving unaligned region")
	}
	lock(&p.scav.lock)
	if gen == p.scav.gen {
		p.scav.inUse.add(r)
	}
	unlock(&p.scav.lock)
}

// scavengeOne walks over address range work until it finds
// a contiguous run of pages to scavenge. It will try to scavenge
// at most max bytes at once, but may scavenge more to avoid
// breaking huge pages. Once it scavenges some memory it returns
// how much it scavenged in bytes.
//
// Returns the number of bytes scavenged and the part of work
// which was not yet searched.
//
// work's base address must be aligned to pallocChunkBytes.
//
// Must run on the systemstack because it acquires p.mheapLock.
//
//go:systemstack
func (p *pageAlloc) scavengeOne(work addrRange, max uintptr) (uintptr, addrRange) {
	// Defensively check if we've received an empty address range.
	// If so, just return.
	if work.size() == 0 {
		// Nothing to do.
		return 0, work
	}
	// Check the prerequisites of work.
	if work.base.addr()%pallocChunkBytes != 0 {
		throw("scavengeOne called with unaligned work region")
	}
	// Calculate the maximum number of pages to scavenge.
	//
	// This should be alignUp(max, pageSize) / pageSize but max can and will
	// be ^uintptr(0), so we need to be very careful not to overflow here.
	// Rather than use alignUp, calculate the number of pages rounded down
	// first, then add back one if necessary.
	maxPages := max / pageSize
	if max%pageSize != 0 {
		maxPages++
	}

	// Calculate the minimum number of pages we can scavenge.
	//
	// Because we can only scavenge whole physical pages, we must
	// ensure that we scavenge at least minPages each time, aligned
	// to minPages*pageSize.
	minPages := physPageSize / pageSize
	if minPages < 1 {
		minPages = 1
	}

	// Fast path: check the chunk containing the top-most address in work.
	if r, w := p.scavengeOneFast(work, minPages, maxPages); r != 0 {
		return r, w
	} else {
		work = w
	}

	// findCandidate finds the next scavenge candidate in work optimistically.
	//
	// Returns the candidate chunk index and true on success, and false on failure.
	//
	// The heap need not be locked.
	findCandidate := func(work addrRange) (chunkIdx, bool) {
		// Iterate over this work's chunks.
		for i := chunkIndex(work.limit.addr() - 1); i >= chunkIndex(work.base.addr()); i-- {
			// If this chunk is totally in-use or has no unscavenged pages, don't bother
			// doing a more sophisticated check.
			//
			// Note we're accessing the summary and the chunks without a lock, but
			// that's fine. We're being optimistic anyway.

			// Check quickly if there are enough free pages at all.
			if p.summary[len(p.summary)-1][i].max() < uint(minPages) {
				continue
			}

			// Run over the chunk looking harder for a candidate. Again, we could
			// race with a lot of different pieces of code, but we're just being
			// optimistic. Make sure we load the l2 pointer atomically though, to
			// avoid races with heap growth. It may or may not be possible to also
			// see a nil pointer in this case if we do race with heap growth, but
			// just defensively ignore the nils. This operation is optimistic anyway.
			l2 := (*[1 << pallocChunksL2Bits]pallocData)(atomic.Loadp(unsafe.Pointer(&p.chunks[i.l1()])))
			if l2 != nil && l2[i.l2()].hasScavengeCandidate(minPages) {
				return i, true
			}
		}
		return 0, false
	}

	// Slow path: iterate optimistically over the in-use address space
	// looking for any free and unscavenged page. If we think we see something,
	// lock and verify it!
	for work.size() != 0 {

		// Search for the candidate.
		candidateChunkIdx, ok := findCandidate(work)
		if !ok {
			// We didn't find a candidate, so we're done.
			work.limit = work.base
			break
		}

		// Lock, so we can verify what we found.
		lock(p.mheapLock)

		// Find, verify, and scavenge if we can.
		chunk := p.chunkOf(candidateChunkIdx)
		base, npages := chunk.findScavengeCandidate(pallocChunkPages-1, minPages, maxPages)
		if npages > 0 {
			work.limit = offAddr{p.scavengeRangeLocked(candidateChunkIdx, base, npages)}
			unlock(p.mheapLock)
			return uintptr(npages) * pageSize, work
		}
		unlock(p.mheapLock)

		// We were fooled, so let's continue from where we left off.
		work.limit = offAddr{chunkBase(candidateChunkIdx)}
	}
	return 0, work
}

// scavengeOneFast is the fast path for scavengeOne, which just checks the top
// chunk of work for some pages to scavenge.
//
// Must run on the system stack because it acquires the heap lock.
//
//go:systemstack
func (p *pageAlloc) scavengeOneFast(work addrRange, minPages, maxPages uintptr) (uintptr, addrRange) {
	maxAddr := work.limit.addr() - 1
	maxChunk := chunkIndex(maxAddr)

	lock(p.mheapLock)
	if p.summary[len(p.summary)-1][maxChunk].max() >= uint(minPages) {
		// We only bother looking for a candidate if there are at least
		// minPages free pages at all.
		base, npages := p.chunkOf(maxChunk).findScavengeCandidate(chunkPageIndex(maxAddr), minPages, maxPages)

		// If we found something, scavenge it and return!
		if npages != 0 {
			work.limit = offAddr{p.scavengeRangeLocked(maxChunk, base, npages)}
			unlock(p.mheapLock)
			return uintptr(npages) * pageSize, work
		}
	}
	unlock(p.mheapLock)

	// Update the limit to reflect the fact that we checked maxChunk already.
	work.limit = offAddr{chunkBase(maxChunk)}
	return 0, work
}

// scavengeRangeLocked scavenges the given region of memory.
// The region of memory is described by its chunk index (ci),
// the starting page index of the region relative to that
// chunk (base), and the length of the region in pages (npages).
//
// Returns the base address of the scavenged region.
//
// p.mheapLock must be held. Unlocks p.mheapLock but reacquires
// it before returning. Must be run on the systemstack as a result.
//
//go:systemstack
func (p *pageAlloc) scavengeRangeLocked(ci chunkIdx, base, npages uint) uintptr {
	assertLockHeld(p.mheapLock)

	// Compute the full address for the start of the range.
	addr := chunkBase(ci) + uintptr(base)*pageSize

	// Mark the range we're about to scavenge as allocated, because
	// we don't want any allocating goroutines to grab it while
	// the scavenging is in progress.
	if scav := p.allocRange(addr, uintptr(npages)); scav != 0 {
		throw("double scavenge")
	}

	// With that done, it's safe to unlock.
	unlock(p.mheapLock)

	// Update the scavenge low watermark.
	lock(&p.scav.lock)
	if oAddr := (offAddr{addr}); oAddr.lessThan(p.scav.scavLWM) {
		p.scav.scavLWM = oAddr
	}
	unlock(&p.scav.lock)

	if !p.test {
		// Only perform the actual scavenging if we're not in a test.
		// It's dangerous to do so otherwise.
		sysUnused(unsafe.Pointer(addr), uintptr(npages)*pageSize)

		// Update global accounting only when not in test, otherwise
		// the runtime's accounting will be wrong.
		nbytes := int64(npages) * pageSize
		atomic.Xadd64(&memstats.heap_released, nbytes)

		// Update consistent accounting too.
		stats := memstats.heapStats.acquire()
		atomic.Xaddint64(&stats.committed, -nbytes)
		atomic.Xaddint64(&stats.released, nbytes)
		memstats.heapStats.release()
	}

	// Relock the heap, because now we need to make these pages
	// available for allocation. Free them back to the page allocator.
	lock(p.mheapLock)
	p.free(addr, uintptr(npages), true)

	// Mark the range as scavenged.
	p.chunkOf(ci).scavenged.setRange(base, npages)
	return addr
}

// fillAligned returns x but with all zeroes in m-aligned
// groups of m bits set to 1 if any bit in the group is non-zero.
//
// For example, fillAligned(0x0100a3, 8) == 0xff00ff.
//
// Note that if m == 1, this is a no-op.
//
// m must be a power of 2 <= maxPagesPerPhysPage.
func fillAligned(x uint64, m uint) uint64 {
	apply := func(x uint64, c uint64) uint64 {
		// The technique used here is derived from
		// https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
		// and extended for more than just bytes (like nibbles
		// and uint16s) by using an appropriate constant.
		//
		// To summarize the technique, quoting from that page:
		// "[It] works by first zeroing the high bits of the [8]
		// bytes in the word. Subsequently, it adds a number that
		// will result in an overflow to the high bit of a byte if
		// any of the low bits were initially set. Next the high
		// bits of the original word are ORed with these values;
		// thus, the high bit of a byte is set iff any bit in the
		// byte was set. Finally, we determine if any of these high
		// bits are zero by ORing with ones everywhere except the
		// high bits and inverting the result."
		return ^((((x & c) + c) | x) | c)
	}
	// Transform x to contain a 1 bit at the top of each m-aligned
	// group of m zero bits.
	switch m {
	case 1:
		return x
	case 2:
		x = apply(x, 0x5555555555555555)
	case 4:
		x = apply(x, 0x7777777777777777)
	case 8:
		x = apply(x, 0x7f7f7f7f7f7f7f7f)
	case 16:
		x = apply(x, 0x7fff7fff7fff7fff)
	case 32:
		x = apply(x, 0x7fffffff7fffffff)
	case 64: // == maxPagesPerPhysPage
		x = apply(x, 0x7fffffffffffffff)
	default:
		throw("bad m value")
	}
	// Now, the top bit of each m-aligned group in x is set
	// if that group was all zero in the original x.

	// From each group of m bits subtract 1.
	// Because we know only the top bits of each
	// m-aligned group are set, we know this will
	// set each group to have all the bits set except
	// the top bit, so just OR with the original
	// result to set all the bits.
	return ^((x - (x >> (m - 1))) | x)
}
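
// As an illustrative walk-through of the example above (worked out by hand,
// not part of the original comment): for fillAligned(0x0100a3, 8), apply is
// called with c = 0x7f7f7f7f7f7f7f7f and yields 0x8080808080008000, i.e. the
// high bit of every all-zero byte is set (bytes 1 and 3-7). The final
// expression then turns each byte carrying such a marker bit back into zero
// and every other byte into 0xff, producing 0xff00ff: bytes 0 and 2, which
// contained 0xa3 and 0x01, become 0xff, while byte 1, which was zero, stays
// zero.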

// hasScavengeCandidate returns true if there are any min-page-aligned groups of
// min pages of free-and-unscavenged memory in the region represented by this
// pallocData.
//
// min must be a non-zero power of 2 <= maxPagesPerPhysPage.
func (m *pallocData) hasScavengeCandidate(min uintptr) bool {
	if min&(min-1) != 0 || min == 0 {
		print("runtime: min = ", min, "\n")
		throw("min must be a non-zero power of 2")
	} else if min > maxPagesPerPhysPage {
		print("runtime: min = ", min, "\n")
		throw("min too large")
	}

	// The goal of this search is to see if the chunk contains any free and unscavenged memory.
	for i := len(m.scavenged) - 1; i >= 0; i-- {
		// 1s are scavenged OR non-free => 0s are unscavenged AND free
		//
		// TODO(mknyszek): Consider splitting up fillAligned into two
		// functions, since here we technically could get by with just
		// the first half of its computation. It'll save a few instructions
		// but adds some additional code complexity.
		x := fillAligned(m.scavenged[i]|m.pallocBits[i], uint(min))

		// Quickly skip over chunks of non-free or scavenged pages.
		if x != ^uint64(0) {
			return true
		}
	}
	return false
}

// findScavengeCandidate returns a start index and a size for this pallocData
// segment which represents a contiguous region of free and unscavenged memory.
//
// searchIdx indicates the page index within this chunk to start the search, but
// note that findScavengeCandidate searches backwards through the pallocData. As
// a result, it will return the highest scavenge candidate in address order.
//
// min indicates a hard minimum size and alignment for runs of pages. That is,
// findScavengeCandidate will not return a region smaller than min pages in size,
// or that is min pages or greater in size but not aligned to min. min must be
// a non-zero power of 2 <= maxPagesPerPhysPage.
//
// max is a hint for how big of a region is desired. If max >= pallocChunkPages, then
// findScavengeCandidate effectively returns entire free and unscavenged regions.
// If max < pallocChunkPages, it may truncate the returned region such that size is
// max. However, findScavengeCandidate may still return a larger region if, for
// example, it chooses to preserve huge pages, or if max is not aligned to min (it
// will round up). That is, even if max is small, the returned size is not guaranteed
// to be equal to max. max is allowed to be less than min, in which case it is as if
// max == min.
func (m *pallocData) findScavengeCandidate(searchIdx uint, min, max uintptr) (uint, uint) {
	if min&(min-1) != 0 || min == 0 {
		print("runtime: min = ", min, "\n")
		throw("min must be a non-zero power of 2")
	} else if min > maxPagesPerPhysPage {
		print("runtime: min = ", min, "\n")
		throw("min too large")
	}
	// max may not be min-aligned, so we might accidentally truncate to
	// a max value which causes us to return a non-min-aligned value.
	// To prevent this, align max up to a multiple of min (which is always
	// a power of 2). This also prevents max from ever being less than
	// min, unless it's zero, so handle that explicitly.
	if max == 0 {
		max = min
	} else {
		max = alignUp(max, min)
	}

	i := int(searchIdx / 64)
	// Start by quickly skipping over blocks of non-free or scavenged pages.
	for ; i >= 0; i-- {
		// 1s are scavenged OR non-free => 0s are unscavenged AND free
		x := fillAligned(m.scavenged[i]|m.pallocBits[i], uint(min))
		if x != ^uint64(0) {
			break
		}
	}
	if i < 0 {
		// Failed to find any free/unscavenged pages.
		return 0, 0
	}
	// We have something in the 64-bit chunk at i, but it could
	// extend further. Loop until we find the extent of it.

	// 1s are scavenged OR non-free => 0s are unscavenged AND free
	x := fillAligned(m.scavenged[i]|m.pallocBits[i], uint(min))
	z1 := uint(sys.LeadingZeros64(^x))
	run, end := uint(0), uint(i)*64+(64-z1)
	if x<<z1 != 0 {
		// After shifting out z1 bits, we still have 1s,
		// so the run ends inside this word.
		run = uint(sys.LeadingZeros64(x << z1))
	} else {
		// After shifting out z1 bits, we have no more 1s.
		// This means the run extends to the bottom of the
		// word so it may extend into further words.
		run = 64 - z1
		for j := i - 1; j >= 0; j-- {
			x := fillAligned(m.scavenged[j]|m.pallocBits[j], uint(min))
			run += uint(sys.LeadingZeros64(x))
			if x != 0 {
				// The run stopped in this word.
				break
			}
		}
	}

	// Split the run we found if it's larger than max but hold on to
	// our original length, since we may need it later.
	size := run
	if size > uint(max) {
		size = uint(max)
	}
	start := end - size

	// Each huge page is guaranteed to fit in a single palloc chunk.
	//
	// TODO(mknyszek): Support larger huge page sizes.
	// TODO(mknyszek): Consider taking pages-per-huge-page as a parameter
	// so we can write tests for this.
	if physHugePageSize > pageSize && physHugePageSize > physPageSize {
		// We have huge pages, so let's ensure we don't break one by scavenging
		// over a huge page boundary. If the range [start, start+size) overlaps with
		// a free-and-unscavenged huge page, we want to grow the region we scavenge
		// to include that huge page.

		// Compute the huge page boundary above our candidate.
		pagesPerHugePage := uintptr(physHugePageSize / pageSize)
		hugePageAbove := uint(alignUp(uintptr(start), pagesPerHugePage))

		// If that boundary is within our current candidate, then we may be breaking
		// a huge page.
		if hugePageAbove <= end {
			// Compute the huge page boundary below our candidate.
			hugePageBelow := uint(alignDown(uintptr(start), pagesPerHugePage))

			if hugePageBelow >= end-run {
				// We're in danger of breaking apart a huge page since start+size crosses
				// a huge page boundary and rounding down start to the nearest huge
				// page boundary is included in the full run we found. Include the entire
				// huge page in the bound by rounding down to the huge page size.
				size = size + (start - hugePageBelow)
				start = hugePageBelow
			}
		}
	}
	return start, size
}
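
// As a purely illustrative example of the huge page adjustment above (the
// numbers are hypothetical): with 8 KiB runtime pages and 2 MiB huge pages,
// pagesPerHugePage is 256. Suppose the full free-and-unscavenged run found in
// a chunk covers pages [0, 300) (run = 300, end = 300) and max truncates the
// candidate to size = 60, so start = 240. Then hugePageAbove = 256 <= end and
// hugePageBelow = 0 >= end-run, so the candidate is grown to [0, 300): the
// huge page spanning pages [0, 256) is scavenged whole rather than split.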