github.com/ice-blockchain/go/src@v0.0.0-20240403114104-1564d284e521/runtime/trace2.go

// Copyright 2023 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build goexperiment.exectracer2

// Go execution tracer.
// The tracer captures a wide range of execution events like goroutine
// creation/blocking/unblocking, syscall enter/exit/block, GC-related events,
// changes of heap size, processor start/stop, etc., and writes them to a buffer
// in a compact form. A nanosecond-precision timestamp and a stack
// trace are captured for most events.
//
// Tracer invariants (to keep the synchronization sound):
// - An m that has a trace buffer must be on either the allm or sched.freem lists.
// - Any trace buffer mutation must either be happening in traceAdvance or between
//   a traceAcquire and a subsequent traceRelease.
// - traceAdvance cannot return until the previous generation's buffers are all flushed.
//
// See https://go.dev/issue/60773 for a link to the full design.

package runtime

import (
	"runtime/internal/atomic"
	"unsafe"
)

// Trace state.

// trace is global tracing context.
var trace struct {
	// trace.lock must only be acquired on the system stack where
	// stack splits cannot happen while it is held.
	lock mutex

	// Trace buffer management.
	//
	// First we check the empty list for any free buffers. If there are none,
	// buffers are allocated directly from the OS. Once they're filled up and/or
	// flushed, they end up on the full queue for trace.gen%2.
	//
	// The trace reader takes buffers off the full list one-by-one and
	// places them into reading until they're finished being read from.
	// Then they're placed onto the empty list.
	//
	// Protected by trace.lock.
	reading       *traceBuf // buffer currently handed off to user
	empty         *traceBuf // stack of empty buffers
	full          [2]traceBufQueue
	workAvailable atomic.Bool

	// State for the trace reader goroutine.
	//
	// Protected by trace.lock.
	readerGen     atomic.Uintptr // the generation the reader is currently reading for
	flushedGen    atomic.Uintptr // the last completed generation
	headerWritten bool           // whether ReadTrace has emitted the trace header

	// doneSema is used to synchronize the reader and traceAdvance. Specifically,
	// it notifies traceAdvance that the reader is done with a generation.
	// Both semaphores are 0 by default (so, acquires block). traceAdvance
	// attempts to acquire for gen%2 after flushing the last buffers for gen.
	// Meanwhile the reader releases the sema for gen%2 when it has finished
	// processing gen.
	doneSema [2]uint32

	// Trace data tables for deduplicating data going into the trace.
	// There are 2 of each: one for gen%2, one for 1-gen%2.
	stackTab  [2]traceStackTable  // maps stack traces to unique ids
	stringTab [2]traceStringTable // maps strings to unique ids

	// cpuLogRead accepts CPU profile samples from the signal handler where
	// they're generated. There are two profBufs here: one for gen%2, one for
	// 1-gen%2. These profBufs use a three-word header to hold the IDs of the P, G,
	// and M (respectively) that were active at the time of the sample.
	// Because profBuf uses a record with all zeros in its header to indicate overflow,
	// we make sure the P field is always non-zero: the ID of a real P will
	// start at bit 1, and bit 0 will be set. Samples that arrive while no P is
	// running (such as near syscalls) will set the first header field to 0b10.
	// This careful handling of the first header field allows us to store the ID of
	// the active G directly in the second field, even though that will be 0
	// when sampling g0.
	//
	// Initialization and teardown of these fields are protected by traceAdvanceSema.
	cpuLogRead  [2]*profBuf
	signalLock  atomic.Uint32              // protects use of the following member, only usable in signal handlers
	cpuLogWrite [2]atomic.Pointer[profBuf] // copy of cpuLogRead for use in signal handlers, set without signalLock
	cpuSleep    *wakeableSleep
	cpuLogDone  <-chan struct{}
	cpuBuf      [2]*traceBuf

	reader atomic.Pointer[g] // goroutine that called ReadTrace, or nil

	// Fast mappings from enumerations to string IDs that are prepopulated
	// in the trace.
	markWorkerLabels [2][len(gcMarkWorkerModeStrings)]traceArg
	goStopReasons    [2][len(traceGoStopReasonStrings)]traceArg
	goBlockReasons   [2][len(traceBlockReasonStrings)]traceArg

	// enabled indicates whether tracing is enabled, but it is only an optimization,
	// NOT the source of truth on whether tracing is enabled. Tracing is only truly
	// enabled if gen != 0. This is used as an optimistic fast path check.
	//
	// Transitioning this value from true -> false is easy (once gen is 0)
	// because it's OK for enabled to have a stale "true" value. traceAcquire will
	// always double-check gen.
	//
	// Transitioning this value from false -> true is harder. We need to make sure
	// this is observable as true strictly before gen != 0. To maintain this invariant
	// we only make this transition with the world stopped and use the store to gen
	// as a publication barrier.
	enabled bool

	// Trace generation counter.
	gen            atomic.Uintptr
	lastNonZeroGen uintptr // last non-zero value of gen

	// shutdown is set when we are waiting for the trace reader to finish after setting gen to 0.
	//
	// Writes protected by trace.lock.
	shutdown atomic.Bool

	// Number of goroutines in the syscall-exiting slow path.
	exitingSyscall atomic.Int32

	// seqGC is the sequence counter for GC begin/end.
	//
	// Mutated only during stop-the-world.
	seqGC uint64
}

// Trace public API.

var (
	traceAdvanceSema  uint32 = 1
	traceShutdownSema uint32 = 1
)

// StartTrace enables tracing for the current process.
// While tracing, the data will be buffered and available via [ReadTrace].
// StartTrace returns an error if tracing is already enabled.
// Most clients should use the [runtime/trace] package or the [testing] package's
// -test.trace flag instead of calling StartTrace directly.
func StartTrace() error {
	if traceEnabled() || traceShuttingDown() {
		return errorString("tracing is already enabled")
	}
	// Block until cleanup of the last trace is done.
	semacquire(&traceShutdownSema)
	semrelease(&traceShutdownSema)

	// Hold traceAdvanceSema across trace start, since we'll want it on
	// the other side of tracing being enabled globally.
	semacquire(&traceAdvanceSema)

	// Initialize CPU profile -> trace ingestion.
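	//
	// The ingestion path set up here consumes samples whose three-word header
	// (documented on trace.cpuLogRead above) conceptually looks like the
	// following. This is an informal sketch, not the exact code, and hdr, pid,
	// goid, and mid are illustrative names only:
	//
	//	hdr[0] = uint64(pid)<<1 | 0b1 // real P running; 0b10 if there was no P
	//	hdr[1] = goid                 // 0 when the sample landed on g0
	//	hdr[2] = uint64(mid)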
	traceInitReadCPU()

	// Compute the first generation for this StartTrace.
	//
	// Note: we start from the last non-zero generation rather than 1 so we
	// can avoid resetting all the arrays indexed by gen%2 or gen%3. There's
	// more than one of each per m, p, and goroutine.
	firstGen := traceNextGen(trace.lastNonZeroGen)

	// Reset GC sequencer.
	trace.seqGC = 1

	// Reset trace reader state.
	trace.headerWritten = false
	trace.readerGen.Store(firstGen)
	trace.flushedGen.Store(0)

	// Register some basic strings in the string tables.
	traceRegisterLabelsAndReasons(firstGen)

	// Stop the world.
	//
	// The purpose of stopping the world is to make sure that no goroutine is in a
	// context where it could emit an event by bringing all goroutines to a safe point
	// with no opportunity to transition.
	//
	// The exceptions to this rule are goroutines that are concurrently exiting a syscall.
	// Those will all be forced into the syscalling slow path, and we'll just make sure
	// that we don't observe any goroutines in that critical section before starting
	// the world again.
	//
	// A good follow-up question to this is why stopping the world is necessary at all
	// given that we have traceAcquire and traceRelease. Unfortunately, those only help
	// us when tracing is already active (for performance, the tracing seqlock is left
	// untouched when tracing is off). The main issue here is subtle: we're going to
	// want to obtain a correct starting status for each goroutine, but there are windows
	// of time in which we could read and emit an incorrect status. Specifically:
	//
	//	trace := traceAcquire()
	//	// <----> problem window
	//	casgstatus(gp, _Gwaiting, _Grunnable)
	//	if trace.ok() {
	//		trace.GoUnpark(gp, 2)
	//		traceRelease(trace)
	//	}
	//
	// More precisely, if we readgstatus for a gp while another goroutine is in the problem
	// window and that goroutine didn't observe that tracing had begun, then we might write
	// a GoStatus(GoWaiting) event for that goroutine, but it won't trace an event marking
	// the transition from GoWaiting to GoRunnable. The trace will then be broken, because
	// future events will be emitted assuming the tracer sees GoRunnable.
	//
	// In short, what we really need here is to make sure that the next time *any goroutine*
	// hits a traceAcquire, it sees that the trace is enabled.
	//
	// Note also that stopping the world is necessary to make sure sweep-related events are
	// coherent. Since the world is stopped and sweeps are non-preemptible, we can never start
	// the world and see an unpaired sweep 'end' event. Other parts of the tracer rely on this.
	stw := stopTheWorld(stwStartTrace)

	// Prevent sysmon from running any code that could generate events.
	lock(&sched.sysmonlock)

	// Reset mSyscallID on all Ps while we have them stationary and the trace is disabled.
	for _, pp := range allp {
		pp.trace.mSyscallID = -1
	}

	// Start tracing.
	//
	// Set trace.enabled. This is *very* subtle. We need to maintain the invariant that if
	// trace.gen != 0, then trace.enabled is always observed as true. Simultaneously, for
	// performance, we need trace.enabled to be read without any synchronization.
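	//
	// In other words, event writers rely on a check pattern roughly like the
	// following. This is an informal sketch of traceAcquire's fast path
	// (traceAcquire is defined elsewhere), not its exact implementation:
	//
	//	if !trace.enabled { // unsynchronized, possibly stale fast path
	//		return traceLocker{}
	//	}
	//	if trace.gen.Load() == 0 { // gen is the real source of truth
	//		return traceLocker{}
	//	}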
	//
	// We ensure this is safe by stopping the world, which acts as a global barrier on almost
	// every M, and explicitly synchronize with any other Ms that could be running concurrently
	// with us. Today, there are only two such cases:
	// - sysmon, which we synchronize with by acquiring sysmonlock.
	// - goroutines exiting syscalls, which we synchronize with via trace.exitingSyscall.
	//
	// After trace.gen is updated, other Ms may start creating trace buffers and emitting
	// data into them.
	trace.enabled = true
	trace.gen.Store(firstGen)

	// Wait for exitingSyscall to drain.
	//
	// It may not monotonically decrease to zero, but in the limit it will always become
	// zero because the world is stopped and there are no available Ps for syscall-exited
	// goroutines to run on.
	//
	// Because we set gen before checking this, and because exitingSyscall is always incremented
	// *before* traceAcquire (which checks gen), we can be certain that once exitingSyscall is zero,
	// any goroutine that goes to exit a syscall from then on *must* observe the new gen as
	// well as trace.enabled being set to true.
	//
	// The critical section on each goroutine here is going to be quite short, so the likelihood
	// that we observe a zero value is high.
	for trace.exitingSyscall.Load() != 0 {
		osyield()
	}

	// Record some initial pieces of information.
	//
	// N.B. This will also emit a status event for this goroutine.
	tl := traceAcquire()
	tl.Gomaxprocs(gomaxprocs)  // Get this as early in the trace as possible. See comment in traceAdvance.
	tl.STWStart(stwStartTrace) // We didn't trace this above, so trace it now.

	// Record the fact that a GC is active, if applicable.
	if gcphase == _GCmark || gcphase == _GCmarktermination {
		tl.GCActive()
	}

	// Record the heap goal so we have it at the very beginning of the trace.
	tl.HeapGoal()

	// Make sure a ProcStatus is emitted for every P, while we're here.
	for _, pp := range allp {
		tl.writer().writeProcStatusForP(pp, pp == tl.mp.p.ptr()).end()
	}
	traceRelease(tl)

	unlock(&sched.sysmonlock)
	startTheWorld(stw)

	traceStartReadCPU()
	traceAdvancer.start()

	semrelease(&traceAdvanceSema)
	return nil
}

// StopTrace stops tracing, if it was previously enabled.
// StopTrace only returns after all the reads for the trace have completed.
func StopTrace() {
	traceAdvance(true)
}

// traceAdvance moves tracing to the next generation, and cleans up the current generation,
// ensuring that it's flushed out before returning. If stopTrace is true, it disables tracing
// altogether instead of advancing to the next generation.
//
// traceAdvanceSema must not be held.
func traceAdvance(stopTrace bool) {
	semacquire(&traceAdvanceSema)

	// Get the gen that we're advancing from. In this function we don't really care much
	// about the generation we're advancing _into_ since we'll do all the cleanup in this
	// generation for the next advancement.
	gen := trace.gen.Load()
	if gen == 0 {
		// We may end up here if traceAdvance is called concurrently with StopTrace.
		semrelease(&traceAdvanceSema)
		return
	}

	// Write an EvFrequency event for this generation.
	//
	// N.B. This may block for quite a while to get a good frequency estimate, so make sure we do
	// this here and not e.g.
	// on the trace reader.
	traceFrequency(gen)

	// Collect all the untraced Gs.
	type untracedG struct {
		gp           *g
		goid         uint64
		mid          int64
		status       uint32
		waitreason   waitReason
		inMarkAssist bool
	}
	var untracedGs []untracedG
	forEachGRace(func(gp *g) {
		// Make absolutely sure all Gs are ready for the next
		// generation. We need to do this even for dead Gs because
		// they may come alive with a new identity, and their
		// status-traced bookkeeping might end up being stale.
		// We may miss totally new goroutines, but they'll always
		// have clean bookkeeping.
		gp.trace.readyNextGen(gen)
		// If the status was traced, nothing else to do.
		if gp.trace.statusWasTraced(gen) {
			return
		}
		// Scribble down information about this goroutine.
		ug := untracedG{gp: gp, mid: -1}
		systemstack(func() {
			me := getg().m.curg
			// We don't have to handle this G status transition because we
			// already eliminated ourselves from consideration above.
			casGToWaiting(me, _Grunning, waitReasonTraceGoroutineStatus)
			// We need to suspend and take ownership of the G to safely read its
			// goid. Note that we can't actually emit the event at this point
			// because we might stop the G in a window where it's unsafe to write
			// events based on the G's status. We need the global trace buffer flush
			// coming up to make sure we're not racing with the G.
			//
			// It should be very unlikely that we try to preempt a running G here.
			// The only situation in which we might is when we're racing with a G
			// that's running for the first time in this generation. Therefore,
			// this should be relatively fast.
			s := suspendG(gp)
			if !s.dead {
				ug.goid = s.g.goid
				if s.g.m != nil {
					ug.mid = int64(s.g.m.procid)
				}
				ug.status = readgstatus(s.g) &^ _Gscan
				ug.waitreason = s.g.waitreason
				ug.inMarkAssist = s.g.inMarkAssist
			}
			resumeG(s)
			casgstatus(me, _Gwaiting, _Grunning)
		})
		if ug.goid != 0 {
			untracedGs = append(untracedGs, ug)
		}
	})

	if !stopTrace {
		// Re-register runtime goroutine labels and stop/block reasons.
		traceRegisterLabelsAndReasons(traceNextGen(gen))
	}

	// Now that we've done some of the heavy stuff, prevent the world from stopping.
	// This is necessary to ensure the consistency of the STW events. If we're feeling
	// adventurous we could lift this restriction and add a STWActive event, but the
	// cost of maintaining this consistency is low. We're not going to hold this semaphore
	// for very long and most STW periods are very short.
	// Once we hold worldsema, prevent preemption as well so we're not interrupted partway
	// through this. We want to get this done as soon as possible.
	semacquire(&worldsema)
	mp := acquirem()

	// Advance the generation or stop the trace.
	trace.lastNonZeroGen = gen
	if stopTrace {
		systemstack(func() {
			// Ordering is important here. Set shutdown first, then disable tracing,
			// so that conditions like (traceEnabled() || traceShuttingDown()) have
			// no opportunity to be false. Hold the trace lock so this update appears
			// atomic to the trace reader.
			lock(&trace.lock)
			trace.shutdown.Store(true)
			trace.gen.Store(0)
			unlock(&trace.lock)

			// Clear trace.enabled. It is totally OK for this value to be stale,
			// because traceAcquire will always double-check gen.
			trace.enabled = false
		})
	} else {
		trace.gen.Store(traceNextGen(gen))
	}

	// Emit a ProcsChange event so we have one on record for each generation.
	// Let's emit it as soon as possible so that downstream tools can rely on the value
	// being there fairly soon in a generation.
	//
	// It's important that we do this before allowing stop-the-worlds again,
	// because the procs count could change.
	if !stopTrace {
		tl := traceAcquire()
		tl.Gomaxprocs(gomaxprocs)
		traceRelease(tl)
	}

	// Emit a GCActive event in the new generation if necessary.
	//
	// It's important that we do this before allowing stop-the-worlds again,
	// because that could emit global GC-related events.
	if !stopTrace && (gcphase == _GCmark || gcphase == _GCmarktermination) {
		tl := traceAcquire()
		tl.GCActive()
		traceRelease(tl)
	}

	// Preemption is OK again after this. If the world stops again after this point, that's fine.
	// We're just cleaning up the last generation after this point.
	//
	// We also don't care if the GC starts again after this for the same reasons.
	releasem(mp)
	semrelease(&worldsema)

	// Snapshot allm and freem.
	//
	// Snapshotting after the generation counter update is sufficient.
	// Because an m must be on either allm or sched.freem if it has an active trace
	// buffer, new threads added to allm after this point must necessarily observe
	// the new generation number (sched.lock acts as a barrier).
	//
	// Threads that exit before this point and are on neither list explicitly
	// flush their own buffers in traceThreadDestroy.
	//
	// Snapshotting freem is necessary because Ms can continue to emit events
	// while they're still on that list. Removal from sched.freem is serialized with
	// this snapshot, so either we'll capture an m on sched.freem and race with
	// the removal to flush its buffers (resolved by traceThreadDestroy acquiring
	// the thread's seqlock, which one of us must win, so at least its old gen buffer
	// will be flushed in time for the new generation) or it will have flushed its
	// buffers before we snapshotted it to begin with.
	lock(&sched.lock)
	mToFlush := allm
	for mp := mToFlush; mp != nil; mp = mp.alllink {
		mp.trace.link = mp.alllink
	}
	for mp := sched.freem; mp != nil; mp = mp.freelink {
		mp.trace.link = mToFlush
		mToFlush = mp
	}
	unlock(&sched.lock)

	// Iterate over our snapshot, flushing every buffer until we're done.
	//
	// Because trace writers read the generation while the seqlock is
	// held, we can be certain that when there are no writers there are
	// also no stale generation values left. Therefore, it's safe to flush
	// any buffers that remain in that generation's slot.
	const debugDeadlock = false
	systemstack(func() {
		// Track iterations for some rudimentary deadlock detection.
		i := 0
		detectedDeadlock := false

		for mToFlush != nil {
			prev := &mToFlush
			for mp := *prev; mp != nil; {
				if mp.trace.seqlock.Load()%2 != 0 {
					// The M is writing. Come back to it later.
					prev = &mp.trace.link
					mp = mp.trace.link
					continue
				}
				// Flush the trace buffer.
				//
				// trace.lock needed for traceBufFlush, but also to synchronize
				// with traceThreadDestroy, which flushes both buffers unconditionally.
				lock(&trace.lock)
				bufp := &mp.trace.buf[gen%2]
				if *bufp != nil {
					traceBufFlush(*bufp, gen)
					*bufp = nil
				}
				unlock(&trace.lock)

				// Remove the m from the flush list.
				*prev = mp.trace.link
				mp.trace.link = nil
				mp = *prev
			}
			// Yield only if we're going to be going around the loop again.
			if mToFlush != nil {
				osyield()
			}

			if debugDeadlock {
				// Try to detect a deadlock. We probably shouldn't loop here
				// this many times.
				if i > 100000 && !detectedDeadlock {
					detectedDeadlock = true
					println("runtime: failing to flush")
					for mp := mToFlush; mp != nil; mp = mp.trace.link {
						print("runtime: m=", mp.id, "\n")
					}
				}
				i++
			}
		}
	})

	// At this point, the old generation is fully flushed minus stack and string
	// tables, CPU samples, and goroutines that haven't run at all during the last
	// generation.

	// Check to see if any Gs still haven't had events written out for them.
	statusWriter := unsafeTraceWriter(gen, nil)
	for _, ug := range untracedGs {
		if ug.gp.trace.statusWasTraced(gen) {
			// It was traced, we don't need to do anything.
			continue
		}
		// It still wasn't traced. Because we ensured all Ms stopped writing trace
		// events to the last generation, that must mean the G never had its status
		// traced in gen between when we recorded it and now. If that's true, the goid
		// and status we recorded then are exactly what we want right now.
		status := goStatusToTraceGoStatus(ug.status, ug.waitreason)
		statusWriter = statusWriter.writeGoStatus(ug.goid, ug.mid, status, ug.inMarkAssist)
	}
	statusWriter.flush().end()

	// Read everything out of the last gen's CPU profile buffer.
	traceReadCPU(gen)

	systemstack(func() {
		// Flush CPU samples, stacks, and strings for the last generation. This is safe,
		// because we're now certain no M is writing to the last generation.
		//
		// Ordering is important here. traceCPUFlush may generate new stacks and dumping
		// stacks may generate new strings.
		traceCPUFlush(gen)
		trace.stackTab[gen%2].dump(gen)
		trace.stringTab[gen%2].reset(gen)

		// That's it. This generation is done producing buffers.
		lock(&trace.lock)
		trace.flushedGen.Store(gen)
		unlock(&trace.lock)
	})

	// Perform status reset on dead Ps because they just appear as idle.
	//
	// Preventing preemption is sufficient to access allp safely. allp is only
	// mutated by GOMAXPROCS calls, which require a STW.
	//
	// TODO(mknyszek): Consider explicitly emitting ProcCreate and ProcDestroy
	// events to indicate whether a P exists, rather than just making its
	// existence implicit.
	mp = acquirem()
	for _, pp := range allp[len(allp):cap(allp)] {
		pp.trace.readyNextGen(traceNextGen(gen))
	}
	releasem(mp)

	if stopTrace {
		// Acquire the shutdown sema to begin the shutdown process.
		semacquire(&traceShutdownSema)

		// Finish off CPU profile reading.
		traceStopReadCPU()
	} else {
		// Go over each P and emit a status event for it if necessary.
		//
		// We do this at the beginning of the new generation instead of the
		// end like we do for goroutines because forEachP doesn't give us a
		// hook to skip Ps that have already been traced. Since we have to
		// preempt all Ps anyway, we might as well stay consistent with StartTrace,
		// which does this during the STW.
		semacquire(&worldsema)
		forEachP(waitReasonTraceProcStatus, func(pp *p) {
			tl := traceAcquire()
			if !pp.trace.statusWasTraced(tl.gen) {
				tl.writer().writeProcStatusForP(pp, false).end()
			}
			traceRelease(tl)
		})
		semrelease(&worldsema)
	}

	// Block until the trace reader has finished processing the last generation.
	semacquire(&trace.doneSema[gen%2])
	if raceenabled {
		raceacquire(unsafe.Pointer(&trace.doneSema[gen%2]))
	}

	// Double-check that things look as we expect after advancing and perform some
	// final cleanup if the trace has fully stopped.
	systemstack(func() {
		lock(&trace.lock)
		if !trace.full[gen%2].empty() {
			throw("trace: non-empty full trace buffer for done generation")
		}
		if stopTrace {
			if !trace.full[1-(gen%2)].empty() {
				throw("trace: non-empty full trace buffer for next generation")
			}
			if trace.reading != nil || trace.reader.Load() != nil {
				throw("trace: reading after shutdown")
			}
			// Free all the empty buffers.
			for trace.empty != nil {
				buf := trace.empty
				trace.empty = buf.link
				sysFree(unsafe.Pointer(buf), unsafe.Sizeof(*buf), &memstats.other_sys)
			}
			// Clear trace.shutdown and other flags.
			trace.headerWritten = false
			trace.shutdown.Store(false)
		}
		unlock(&trace.lock)
	})

	if stopTrace {
		// Clear the sweep state on every P for the next time tracing is enabled.
		//
		// It may be stale in the next trace because we may have ended tracing in
		// the middle of a sweep on a P.
		//
		// It's fine not to call forEachP here because tracing is disabled and we
		// know at this point that nothing is calling into the tracer, but we do
		// need to look at dead Ps because GOMAXPROCS could have been called
		// at any point since we stopped tracing, and we have to ensure there's no
		// bad state on dead Ps as well. Prevent a STW and a concurrent GOMAXPROCS that
		// might mutate allp by making ourselves briefly non-preemptible.
		mp := acquirem()
		for _, pp := range allp[:cap(allp)] {
			pp.trace.inSweep = false
			pp.trace.maySweep = false
			pp.trace.swept = 0
			pp.trace.reclaimed = 0
		}
		releasem(mp)
	}

	// Release the advance semaphore. If stopTrace is true we're still holding onto
	// traceShutdownSema.
	//
	// Do a direct handoff. Don't let one caller of traceAdvance starve
	// other calls to traceAdvance.
	semrelease1(&traceAdvanceSema, true, 0)

	if stopTrace {
		// Stop the traceAdvancer. We can't be holding traceAdvanceSema here because
		// we'll deadlock (we're blocked on the advancer goroutine exiting, but it
		// may be currently trying to acquire traceAdvanceSema).
		traceAdvancer.stop()
		semrelease(&traceShutdownSema)
	}
}

func traceNextGen(gen uintptr) uintptr {
	if gen == ^uintptr(0) {
		// gen is used both mod 2 and mod 3, and we want both patterns to continue when we loop around.
		// ^uint32(0) and ^uint64(0) are both odd and multiples of 3. Therefore the next generation
		// we want is even and one more than a multiple of 3. The smallest such number is 4.
		return 4
	}
	return gen + 1
}

// traceRegisterLabelsAndReasons re-registers mark worker labels and
// goroutine stop/block reasons in the string table for the provided
// generation. Note: the provided generation must not have started yet.
func traceRegisterLabelsAndReasons(gen uintptr) {
	for i, label := range gcMarkWorkerModeStrings[:] {
		trace.markWorkerLabels[gen%2][i] = traceArg(trace.stringTab[gen%2].put(gen, label))
	}
	for i, str := range traceBlockReasonStrings[:] {
		trace.goBlockReasons[gen%2][i] = traceArg(trace.stringTab[gen%2].put(gen, str))
	}
	for i, str := range traceGoStopReasonStrings[:] {
		trace.goStopReasons[gen%2][i] = traceArg(trace.stringTab[gen%2].put(gen, str))
	}
}

// ReadTrace returns the next chunk of binary tracing data, blocking until data
// is available. If tracing is turned off and all the data accumulated while it
// was on has been returned, ReadTrace returns nil. The caller must copy the
// returned data before calling ReadTrace again.
// ReadTrace must be called from one goroutine at a time.
func ReadTrace() []byte {
top:
	var buf []byte
	var park bool
	systemstack(func() {
		buf, park = readTrace0()
	})
	if park {
		gopark(func(gp *g, _ unsafe.Pointer) bool {
			if !trace.reader.CompareAndSwapNoWB(nil, gp) {
				// We're racing with another reader.
				// Wake up and handle this case.
				return false
			}

			if g2 := traceReader(); gp == g2 {
				// New data arrived between unlocking
				// and the CAS and we won the wake-up
				// race, so wake up directly.
				return false
			} else if g2 != nil {
				printlock()
				println("runtime: got trace reader", g2, g2.goid)
				throw("unexpected trace reader")
			}

			return true
		}, nil, waitReasonTraceReaderBlocked, traceBlockSystemGoroutine, 2)
		goto top
	}

	return buf
}

// readTrace0 is ReadTrace's continuation on g0. This must run on the
// system stack because it acquires trace.lock.
//
//go:systemstack
func readTrace0() (buf []byte, park bool) {
	if raceenabled {
		// g0 doesn't have a race context. Borrow the user G's.
		if getg().racectx != 0 {
			throw("expected racectx == 0")
		}
		getg().racectx = getg().m.curg.racectx
		// (This defer should get open-coded, which is safe on
		// the system stack.)
		defer func() { getg().racectx = 0 }()
	}

	// This function must not allocate while holding trace.lock:
	// allocation can call heap allocate, which will try to emit a trace
	// event while holding heap lock.
	lock(&trace.lock)

	if trace.reader.Load() != nil {
		// More than one goroutine is reading the trace. This is bad,
		// but we'd rather not crash the program because of tracing,
		// since tracing can be enabled at runtime on production servers.
		unlock(&trace.lock)
		println("runtime: ReadTrace called from multiple goroutines simultaneously")
		return nil, false
	}
	// Recycle the old buffer.
	if buf := trace.reading; buf != nil {
		buf.link = trace.empty
		trace.empty = buf
		trace.reading = nil
	}
	// Write trace header.
	if !trace.headerWritten {
		trace.headerWritten = true
		unlock(&trace.lock)
		return []byte("go 1.23 trace\x00\x00\x00"), false
	}

	// Read the next buffer.

	if trace.readerGen.Load() == 0 {
		trace.readerGen.Store(1)
	}
	var gen uintptr
	for {
		assertLockHeld(&trace.lock)
		gen = trace.readerGen.Load()

		// Check to see if we need to block for more data in this generation
		// or if we need to move our generation forward.
		if !trace.full[gen%2].empty() {
			break
		}
		// Most of the time readerGen is one generation ahead of flushedGen, as the
		// current generation is being read from. Then, once the last buffer is flushed
		// into readerGen, flushedGen will rise to meet it. At this point, the tracer
		// is waiting on the reader to finish flushing the last generation so that it
		// can continue to advance.
		if trace.flushedGen.Load() == gen {
			if trace.shutdown.Load() {
				unlock(&trace.lock)

				// Wake up anyone waiting for us to be done with this generation.
				//
				// Do this after reading trace.shutdown, because the thread we're
				// waking up is going to clear trace.shutdown.
				if raceenabled {
					// Model synchronization on trace.doneSema, which the race
					// detector does not see. This is required to avoid false
					// race reports on the writer passed to trace.Start.
					racerelease(unsafe.Pointer(&trace.doneSema[gen%2]))
				}
				semrelease(&trace.doneSema[gen%2])

				// We're shutting down, and the last generation is fully
				// read. We're done.
				return nil, false
			}
			// The previous gen has had all of its buffers flushed, and
			// there's nothing else for us to read. Advance the generation
			// we're reading from and try again.
			trace.readerGen.Store(trace.gen.Load())
			unlock(&trace.lock)

			// Wake up anyone waiting for us to be done with this generation.
			//
			// Do this after reading gen to make sure we can't have the trace
			// advance until we've read it.
			if raceenabled {
				// See comment above in the shutdown case.
				racerelease(unsafe.Pointer(&trace.doneSema[gen%2]))
			}
			semrelease(&trace.doneSema[gen%2])

			// Reacquire the lock and go back to the top of the loop.
			lock(&trace.lock)
			continue
		}
		// Wait for new data.
		//
		// We don't simply use a note because the scheduler
		// executes this goroutine directly when it wakes up
		// (also a note would consume an M).
		//
		// Before we drop the lock, clear the workAvailable flag. Work can
		// only be queued with trace.lock held, so this is at least true until
		// we drop the lock.
		trace.workAvailable.Store(false)
		unlock(&trace.lock)
		return nil, true
	}
	// Pull a buffer.
	tbuf := trace.full[gen%2].pop()
	trace.reading = tbuf
	unlock(&trace.lock)
	return tbuf.arr[:tbuf.pos], false
}

// traceReader returns the trace reader that should be woken up, if any.
// Callers should first check (traceEnabled() || traceShuttingDown()).
//
// This must run on the system stack because it acquires trace.lock.
//
//go:systemstack
func traceReader() *g {
	gp := traceReaderAvailable()
	if gp == nil || !trace.reader.CompareAndSwapNoWB(gp, nil) {
		return nil
	}
	return gp
}

// traceReaderAvailable returns the trace reader if it is not currently
// scheduled and should be. Callers should first check that
// (traceEnabled() || traceShuttingDown()) is true.
func traceReaderAvailable() *g {
	// There are three conditions under which we definitely want to schedule
	// the reader:
	// - The reader is lagging behind in finishing off the last generation.
	//   In this case, trace buffers could even be empty, but the trace
	//   advancer will be waiting on the reader, so we have to make sure
	//   to schedule the reader ASAP.
	// - The reader has pending work to process for its reader generation
	//   (assuming readerGen is not lagging behind). Note that we also want
	//   to be careful *not* to schedule the reader if there's no work to do.
	// - The trace is shutting down. The trace stopper blocks on the reader
	//   to finish, much like trace advancement.
	//
	// We also want to be careful not to schedule the reader if there's no
	// reason to.
	if trace.flushedGen.Load() == trace.readerGen.Load() || trace.workAvailable.Load() || trace.shutdown.Load() {
		return trace.reader.Load()
	}
	return nil
}

// Trace advancer goroutine.
var traceAdvancer traceAdvancerState

type traceAdvancerState struct {
	timer *wakeableSleep
	done  chan struct{}
}

// start starts a new traceAdvancer.
func (s *traceAdvancerState) start() {
	// Start a goroutine to periodically advance the trace generation.
	s.done = make(chan struct{})
	s.timer = newWakeableSleep()
	go func() {
		for traceEnabled() {
			// Set a timer to wake us up.
			s.timer.sleep(int64(debug.traceadvanceperiod))

			// Try to advance the trace.
			traceAdvance(false)
		}
		s.done <- struct{}{}
	}()
}

// stop stops a traceAdvancer and blocks until it exits.
func (s *traceAdvancerState) stop() {
	s.timer.wake()
	<-s.done
	close(s.done)
	s.timer.close()
}

// defaultTraceAdvancePeriod is the default approximate period between
// new generations.
const defaultTraceAdvancePeriod = 1e9 // 1 second.

// wakeableSleep manages a wakeable goroutine sleep.
//
// Users of this type must initialize it with newWakeableSleep before
// first use and call close to free up resources. Once close is called,
// a new wakeableSleep must be created before another use.
type wakeableSleep struct {
	timer *timer

	// lock protects access to wakeup, but not send/recv on it.
	lock   mutex
	wakeup chan struct{}
}

// newWakeableSleep initializes a new wakeableSleep and returns it.
func newWakeableSleep() *wakeableSleep {
	s := new(wakeableSleep)
	lockInit(&s.lock, lockRankWakeableSleep)
	s.wakeup = make(chan struct{}, 1)
	s.timer = new(timer)
	f := func(s any, _ uintptr, _ int64) {
		s.(*wakeableSleep).wake()
	}
	s.timer.init(f, s)
	return s
}

// sleep sleeps for the provided duration in nanoseconds or until
// another goroutine calls wake.
//
// Must not be called by more than one goroutine at a time and
// must not be called concurrently with close.
func (s *wakeableSleep) sleep(ns int64) {
	s.timer.reset(nanotime()+ns, 0)
	lock(&s.lock)
	if raceenabled {
		raceacquire(unsafe.Pointer(&s.lock))
	}
	wakeup := s.wakeup
	if raceenabled {
		racerelease(unsafe.Pointer(&s.lock))
	}
	unlock(&s.lock)
	<-wakeup
	s.timer.stop()
}

// wake awakens any goroutine sleeping on the timer.
//
// Safe for concurrent use with all other methods.
func (s *wakeableSleep) wake() {
	// Grab the wakeup channel, which may be nil if we're
	// racing with close.
	lock(&s.lock)
	if raceenabled {
		raceacquire(unsafe.Pointer(&s.lock))
	}
	if s.wakeup != nil {
		// Non-blocking send.
		//
		// Others may also write to this channel and we don't
		// want to block on the receiver waking up. This also
		// effectively batches together wakeup notifications.
		select {
		case s.wakeup <- struct{}{}:
		default:
		}
	}
	if raceenabled {
		racerelease(unsafe.Pointer(&s.lock))
	}
	unlock(&s.lock)
}

// close wakes any goroutine sleeping on the timer and prevents
// further sleeping on it.
//
// Once close is called, the wakeableSleep must no longer be used.
//
// It must only be called once no goroutine is sleeping on the
// timer *and* nothing else will call wake concurrently.
func (s *wakeableSleep) close() {
	// Set wakeup to nil so that a late timer ends up being a no-op.
	lock(&s.lock)
	if raceenabled {
		raceacquire(unsafe.Pointer(&s.lock))
	}
	wakeup := s.wakeup
	s.wakeup = nil

	// Close the channel.
	close(wakeup)

	if raceenabled {
		racerelease(unsafe.Pointer(&s.lock))
	}
	unlock(&s.lock)
	return
}
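// The sketch below shows how a client of the public API above might drive it.
// It is illustrative only (w is a placeholder for wherever the trace bytes
// should go); most programs should use runtime/trace.Start and
// runtime/trace.Stop instead:
//
//	if err := StartTrace(); err != nil {
//		// Tracing was already enabled.
//	}
//	go func() {
//		for {
//			data := ReadTrace()
//			if data == nil {
//				return // tracing stopped and all buffered data has been returned
//			}
//			// The returned bytes must be consumed (or copied) before the
//			// next call to ReadTrace.
//			w.Write(data)
//		}
//	}()
//	// ... run the workload of interest ...
//	StopTrace() // returns only after the reader has drained the trace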