github.com/go-asm/go@v1.21.1-0.20240213172139-40c5ead50c48/trace/v2/order.go (about) 1 // Copyright 2023 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package trace 6 7 import ( 8 "fmt" 9 "strings" 10 11 "github.com/go-asm/go/trace/v2/event" 12 "github.com/go-asm/go/trace/v2/event/go122" 13 "github.com/go-asm/go/trace/v2/version" 14 ) 15 16 // ordering emulates Go scheduler state for both validation and 17 // for putting events in the right order. 18 type ordering struct { 19 gStates map[GoID]*gState 20 pStates map[ProcID]*pState // TODO: The keys are dense, so this can be a slice. 21 mStates map[ThreadID]*mState 22 activeTasks map[TaskID]taskState 23 gcSeq uint64 24 gcState gcState 25 initialGen uint64 26 27 // Some events like GoDestroySyscall produce two events instead of one. 28 // extraEvent is this extra space. advance must not be called unless 29 // the extraEvent has been consumed with consumeExtraEvent. 30 // 31 // TODO(mknyszek): Replace this with a more formal queue. 32 extraEvent Event 33 } 34 35 // consumeExtraEvent consumes the extra event. 36 func (o *ordering) consumeExtraEvent() Event { 37 if o.extraEvent.Kind() == EventBad { 38 return Event{} 39 } 40 r := o.extraEvent 41 o.extraEvent = Event{} 42 return r 43 } 44 45 // advance checks if it's valid to proceed with ev which came from thread m. 46 // 47 // Returns the schedCtx at the point of the event, whether it's OK to advance 48 // with this event, and any error encountered in validation. 49 // 50 // It assumes the gen value passed to it is monotonically increasing across calls. 51 // 52 // If any error is returned, then the trace is broken and trace parsing must cease. 53 // If it's not valid to advance with ev, but no error was encountered, the caller 54 // should attempt to advance with other candidate events from other threads. If the 55 // caller runs out of candidates, the trace is invalid. 56 func (o *ordering) advance(ev *baseEvent, evt *evTable, m ThreadID, gen uint64) (schedCtx, bool, error) { 57 if o.initialGen == 0 { 58 // Set the initial gen if necessary. 59 o.initialGen = gen 60 } 61 62 var curCtx, newCtx schedCtx 63 curCtx.M = m 64 newCtx.M = m 65 66 if m == NoThread { 67 curCtx.P = NoProc 68 curCtx.G = NoGoroutine 69 newCtx = curCtx 70 } else { 71 // Pull out or create the mState for this event. 72 ms, ok := o.mStates[m] 73 if !ok { 74 ms = &mState{ 75 g: NoGoroutine, 76 p: NoProc, 77 } 78 o.mStates[m] = ms 79 } 80 curCtx.P = ms.p 81 curCtx.G = ms.g 82 newCtx = curCtx 83 defer func() { 84 // Update the mState for this event. 85 ms.p = newCtx.P 86 ms.g = newCtx.G 87 }() 88 } 89 90 switch typ := ev.typ; typ { 91 // Handle procs. 92 case go122.EvProcStatus: 93 pid := ProcID(ev.args[0]) 94 status := go122.ProcStatus(ev.args[1]) 95 if int(status) >= len(go122ProcStatus2ProcState) { 96 return curCtx, false, fmt.Errorf("invalid status for proc %d: %d", pid, status) 97 } 98 oldState := go122ProcStatus2ProcState[status] 99 if s, ok := o.pStates[pid]; ok { 100 if status == go122.ProcSyscallAbandoned && s.status == go122.ProcSyscall { 101 // ProcSyscallAbandoned is a special case of ProcSyscall. It indicates a 102 // potential loss of information, but if we're already in ProcSyscall, 103 // we haven't lost the relevant information. Promote the status and advance. 104 oldState = ProcRunning 105 ev.args[1] = uint64(go122.ProcSyscall) 106 } else if status == go122.ProcSyscallAbandoned && s.status == go122.ProcSyscallAbandoned { 107 // If we're passing through ProcSyscallAbandoned, then there's no promotion 108 // to do. We've lost the M that this P is associated with. However it got there, 109 // it's going to appear as idle in the API, so pass through as idle. 110 oldState = ProcIdle 111 ev.args[1] = uint64(go122.ProcSyscallAbandoned) 112 } else if s.status != status { 113 return curCtx, false, fmt.Errorf("inconsistent status for proc %d: old %v vs. new %v", pid, s.status, status) 114 } 115 s.seq = makeSeq(gen, 0) // Reset seq. 116 } else { 117 o.pStates[pid] = &pState{id: pid, status: status, seq: makeSeq(gen, 0)} 118 if gen == o.initialGen { 119 oldState = ProcUndetermined 120 } else { 121 oldState = ProcNotExist 122 } 123 } 124 ev.extra(version.Go122)[0] = uint64(oldState) // Smuggle in the old state for StateTransition. 125 126 // Bind the proc to the new context, if it's running. 127 if status == go122.ProcRunning || status == go122.ProcSyscall { 128 newCtx.P = pid 129 } 130 // If we're advancing through ProcSyscallAbandoned *but* oldState is running then we've 131 // promoted it to ProcSyscall. However, because it's ProcSyscallAbandoned, we know this 132 // P is about to get stolen and its status very likely isn't being emitted by the same 133 // thread it was bound to. Since this status is Running -> Running and Running is binding, 134 // we need to make sure we emit it in the right context: the context to which it is bound. 135 // Find it, and set our current context to it. 136 if status == go122.ProcSyscallAbandoned && oldState == ProcRunning { 137 // N.B. This is slow but it should be fairly rare. 138 found := false 139 for mid, ms := range o.mStates { 140 if ms.p == pid { 141 curCtx.M = mid 142 curCtx.P = pid 143 curCtx.G = ms.g 144 found = true 145 } 146 } 147 if !found { 148 return curCtx, false, fmt.Errorf("failed to find sched context for proc %d that's about to be stolen", pid) 149 } 150 } 151 return curCtx, true, nil 152 case go122.EvProcStart: 153 pid := ProcID(ev.args[0]) 154 seq := makeSeq(gen, ev.args[1]) 155 156 // Try to advance. We might fail here due to sequencing, because the P hasn't 157 // had a status emitted, or because we already have a P and we're in a syscall, 158 // and we haven't observed that it was stolen from us yet. 159 state, ok := o.pStates[pid] 160 if !ok || state.status != go122.ProcIdle || !seq.succeeds(state.seq) || curCtx.P != NoProc { 161 // We can't make an inference as to whether this is bad. We could just be seeing 162 // a ProcStart on a different M before the proc's state was emitted, or before we 163 // got to the right point in the trace. 164 // 165 // Note that we also don't advance here if we have a P and we're in a syscall. 166 return curCtx, false, nil 167 } 168 // We can advance this P. Check some invariants. 169 // 170 // We might have a goroutine if a goroutine is exiting a syscall. 171 reqs := event.SchedReqs{Thread: event.MustHave, Proc: event.MustNotHave, Goroutine: event.MayHave} 172 if err := validateCtx(curCtx, reqs); err != nil { 173 return curCtx, false, err 174 } 175 state.status = go122.ProcRunning 176 state.seq = seq 177 newCtx.P = pid 178 return curCtx, true, nil 179 case go122.EvProcStop: 180 // We must be able to advance this P. 181 // 182 // There are 2 ways a P can stop: ProcStop and ProcSteal. ProcStop is used when the P 183 // is stopped by the same M that started it, while ProcSteal is used when another M 184 // steals the P by stopping it from a distance. 185 // 186 // Since a P is bound to an M, and we're stopping on the same M we started, it must 187 // always be possible to advance the current M's P from a ProcStop. This is also why 188 // ProcStop doesn't need a sequence number. 189 state, ok := o.pStates[curCtx.P] 190 if !ok { 191 return curCtx, false, fmt.Errorf("event %s for proc (%v) that doesn't exist", go122.EventString(typ), curCtx.P) 192 } 193 if state.status != go122.ProcRunning && state.status != go122.ProcSyscall { 194 return curCtx, false, fmt.Errorf("%s event for proc that's not %s or %s", go122.EventString(typ), go122.ProcRunning, go122.ProcSyscall) 195 } 196 reqs := event.SchedReqs{Thread: event.MustHave, Proc: event.MustHave, Goroutine: event.MayHave} 197 if err := validateCtx(curCtx, reqs); err != nil { 198 return curCtx, false, err 199 } 200 state.status = go122.ProcIdle 201 newCtx.P = NoProc 202 return curCtx, true, nil 203 case go122.EvProcSteal: 204 pid := ProcID(ev.args[0]) 205 seq := makeSeq(gen, ev.args[1]) 206 state, ok := o.pStates[pid] 207 if !ok || (state.status != go122.ProcSyscall && state.status != go122.ProcSyscallAbandoned) || !seq.succeeds(state.seq) { 208 // We can't make an inference as to whether this is bad. We could just be seeing 209 // a ProcStart on a different M before the proc's state was emitted, or before we 210 // got to the right point in the trace. 211 return curCtx, false, nil 212 } 213 // We can advance this P. Check some invariants. 214 reqs := event.SchedReqs{Thread: event.MustHave, Proc: event.MayHave, Goroutine: event.MayHave} 215 if err := validateCtx(curCtx, reqs); err != nil { 216 return curCtx, false, err 217 } 218 // Smuggle in the P state that let us advance so we can surface information to the event. 219 // Specifically, we need to make sure that the event is interpreted not as a transition of 220 // ProcRunning -> ProcIdle but ProcIdle -> ProcIdle instead. 221 // 222 // ProcRunning is binding, but we may be running with a P on the current M and we can't 223 // bind another P. This P is about to go ProcIdle anyway. 224 oldStatus := state.status 225 ev.extra(version.Go122)[0] = uint64(oldStatus) 226 227 // Update the P's status and sequence number. 228 state.status = go122.ProcIdle 229 state.seq = seq 230 231 // If we've lost information then don't try to do anything with the M. 232 // It may have moved on and we can't be sure. 233 if oldStatus == go122.ProcSyscallAbandoned { 234 return curCtx, true, nil 235 } 236 237 // Validate that the M we're stealing from is what we expect. 238 mid := ThreadID(ev.args[2]) // The M we're stealing from. 239 240 if mid == curCtx.M { 241 // We're stealing from ourselves. This behaves like a ProcStop. 242 if curCtx.P != pid { 243 return curCtx, false, fmt.Errorf("tried to self-steal proc %d (thread %d), but got proc %d instead", pid, mid, curCtx.P) 244 } 245 newCtx.P = NoProc 246 return curCtx, true, nil 247 } 248 249 // We're stealing from some other M. 250 mState, ok := o.mStates[mid] 251 if !ok { 252 return curCtx, false, fmt.Errorf("stole proc from non-existent thread %d", mid) 253 } 254 255 // Make sure we're actually stealing the right P. 256 if mState.p != pid { 257 return curCtx, false, fmt.Errorf("tried to steal proc %d from thread %d, but got proc %d instead", pid, mid, mState.p) 258 } 259 260 // Tell the M it has no P so it can proceed. 261 // 262 // This is safe because we know the P was in a syscall and 263 // the other M must be trying to get out of the syscall. 264 // GoSyscallEndBlocked cannot advance until the corresponding 265 // M loses its P. 266 mState.p = NoProc 267 return curCtx, true, nil 268 269 // Handle goroutines. 270 case go122.EvGoStatus: 271 gid := GoID(ev.args[0]) 272 mid := ThreadID(ev.args[1]) 273 status := go122.GoStatus(ev.args[2]) 274 275 if int(status) >= len(go122GoStatus2GoState) { 276 return curCtx, false, fmt.Errorf("invalid status for goroutine %d: %d", gid, status) 277 } 278 oldState := go122GoStatus2GoState[status] 279 if s, ok := o.gStates[gid]; ok { 280 if s.status != status { 281 return curCtx, false, fmt.Errorf("inconsistent status for goroutine %d: old %v vs. new %v", gid, s.status, status) 282 } 283 s.seq = makeSeq(gen, 0) // Reset seq. 284 } else if gen == o.initialGen { 285 // Set the state. 286 o.gStates[gid] = &gState{id: gid, status: status, seq: makeSeq(gen, 0)} 287 oldState = GoUndetermined 288 } else { 289 return curCtx, false, fmt.Errorf("found goroutine status for new goroutine after the first generation: id=%v status=%v", gid, status) 290 } 291 ev.extra(version.Go122)[0] = uint64(oldState) // Smuggle in the old state for StateTransition. 292 293 switch status { 294 case go122.GoRunning: 295 // Bind the goroutine to the new context, since it's running. 296 newCtx.G = gid 297 case go122.GoSyscall: 298 if mid == NoThread { 299 return curCtx, false, fmt.Errorf("found goroutine %d in syscall without a thread", gid) 300 } 301 // Is the syscall on this thread? If so, bind it to the context. 302 // Otherwise, we're talking about a G sitting in a syscall on an M. 303 // Validate the named M. 304 if mid == curCtx.M { 305 if gen != o.initialGen && curCtx.G != gid { 306 // If this isn't the first generation, we *must* have seen this 307 // binding occur already. Even if the G was blocked in a syscall 308 // for multiple generations since trace start, we would have seen 309 // a previous GoStatus event that bound the goroutine to an M. 310 return curCtx, false, fmt.Errorf("inconsistent thread for syscalling goroutine %d: thread has goroutine %d", gid, curCtx.G) 311 } 312 newCtx.G = gid 313 break 314 } 315 // Now we're talking about a thread and goroutine that have been 316 // blocked on a syscall for the entire generation. This case must 317 // not have a P; the runtime makes sure that all Ps are traced at 318 // the beginning of a generation, which involves taking a P back 319 // from every thread. 320 ms, ok := o.mStates[mid] 321 if ok { 322 // This M has been seen. That means we must have seen this 323 // goroutine go into a syscall on this thread at some point. 324 if ms.g != gid { 325 // But the G on the M doesn't match. Something's wrong. 326 return curCtx, false, fmt.Errorf("inconsistent thread for syscalling goroutine %d: thread has goroutine %d", gid, ms.g) 327 } 328 // This case is just a Syscall->Syscall event, which needs to 329 // appear as having the G currently bound to this M. 330 curCtx.G = ms.g 331 } else if !ok { 332 // The M hasn't been seen yet. That means this goroutine 333 // has just been sitting in a syscall on this M. Create 334 // a state for it. 335 o.mStates[mid] = &mState{g: gid, p: NoProc} 336 // Don't set curCtx.G in this case because this event is the 337 // binding event (and curCtx represents the "before" state). 338 } 339 // Update the current context to the M we're talking about. 340 curCtx.M = mid 341 } 342 return curCtx, true, nil 343 case go122.EvGoCreate: 344 // Goroutines must be created on a running P, but may or may not be created 345 // by a running goroutine. 346 reqs := event.SchedReqs{Thread: event.MustHave, Proc: event.MustHave, Goroutine: event.MayHave} 347 if err := validateCtx(curCtx, reqs); err != nil { 348 return curCtx, false, err 349 } 350 // If we have a goroutine, it must be running. 351 if state, ok := o.gStates[curCtx.G]; ok && state.status != go122.GoRunning { 352 return curCtx, false, fmt.Errorf("%s event for goroutine that's not %s", go122.EventString(typ), GoRunning) 353 } 354 // This goroutine created another. Add a state for it. 355 newgid := GoID(ev.args[0]) 356 if _, ok := o.gStates[newgid]; ok { 357 return curCtx, false, fmt.Errorf("tried to create goroutine (%v) that already exists", newgid) 358 } 359 o.gStates[newgid] = &gState{id: newgid, status: go122.GoRunnable, seq: makeSeq(gen, 0)} 360 return curCtx, true, nil 361 case go122.EvGoDestroy, go122.EvGoStop, go122.EvGoBlock: 362 // These are goroutine events that all require an active running 363 // goroutine on some thread. They must *always* be advance-able, 364 // since running goroutines are bound to their M. 365 if err := validateCtx(curCtx, event.UserGoReqs); err != nil { 366 return curCtx, false, err 367 } 368 state, ok := o.gStates[curCtx.G] 369 if !ok { 370 return curCtx, false, fmt.Errorf("event %s for goroutine (%v) that doesn't exist", go122.EventString(typ), curCtx.G) 371 } 372 if state.status != go122.GoRunning { 373 return curCtx, false, fmt.Errorf("%s event for goroutine that's not %s", go122.EventString(typ), GoRunning) 374 } 375 // Handle each case slightly differently; we just group them together 376 // because they have shared preconditions. 377 switch typ { 378 case go122.EvGoDestroy: 379 // This goroutine is exiting itself. 380 delete(o.gStates, curCtx.G) 381 newCtx.G = NoGoroutine 382 case go122.EvGoStop: 383 // Goroutine stopped (yielded). It's runnable but not running on this M. 384 state.status = go122.GoRunnable 385 newCtx.G = NoGoroutine 386 case go122.EvGoBlock: 387 // Goroutine blocked. It's waiting now and not running on this M. 388 state.status = go122.GoWaiting 389 newCtx.G = NoGoroutine 390 } 391 return curCtx, true, nil 392 case go122.EvGoStart: 393 gid := GoID(ev.args[0]) 394 seq := makeSeq(gen, ev.args[1]) 395 state, ok := o.gStates[gid] 396 if !ok || state.status != go122.GoRunnable || !seq.succeeds(state.seq) { 397 // We can't make an inference as to whether this is bad. We could just be seeing 398 // a GoStart on a different M before the goroutine was created, before it had its 399 // state emitted, or before we got to the right point in the trace yet. 400 return curCtx, false, nil 401 } 402 // We can advance this goroutine. Check some invariants. 403 reqs := event.SchedReqs{Thread: event.MustHave, Proc: event.MustHave, Goroutine: event.MustNotHave} 404 if err := validateCtx(curCtx, reqs); err != nil { 405 return curCtx, false, err 406 } 407 state.status = go122.GoRunning 408 state.seq = seq 409 newCtx.G = gid 410 return curCtx, true, nil 411 case go122.EvGoUnblock: 412 // N.B. These both reference the goroutine to unblock, not the current goroutine. 413 gid := GoID(ev.args[0]) 414 seq := makeSeq(gen, ev.args[1]) 415 state, ok := o.gStates[gid] 416 if !ok || state.status != go122.GoWaiting || !seq.succeeds(state.seq) { 417 // We can't make an inference as to whether this is bad. We could just be seeing 418 // a GoUnblock on a different M before the goroutine was created and blocked itself, 419 // before it had its state emitted, or before we got to the right point in the trace yet. 420 return curCtx, false, nil 421 } 422 state.status = go122.GoRunnable 423 state.seq = seq 424 // N.B. No context to validate. Basically anything can unblock 425 // a goroutine (e.g. sysmon). 426 return curCtx, true, nil 427 case go122.EvGoSyscallBegin: 428 // Entering a syscall requires an active running goroutine with a 429 // proc on some thread. It is always advancable. 430 if err := validateCtx(curCtx, event.UserGoReqs); err != nil { 431 return curCtx, false, err 432 } 433 state, ok := o.gStates[curCtx.G] 434 if !ok { 435 return curCtx, false, fmt.Errorf("event %s for goroutine (%v) that doesn't exist", go122.EventString(typ), curCtx.G) 436 } 437 if state.status != go122.GoRunning { 438 return curCtx, false, fmt.Errorf("%s event for goroutine that's not %s", go122.EventString(typ), GoRunning) 439 } 440 // Goroutine entered a syscall. It's still running on this P and M. 441 state.status = go122.GoSyscall 442 pState, ok := o.pStates[curCtx.P] 443 if !ok { 444 return curCtx, false, fmt.Errorf("uninitialized proc %d found during %s", curCtx.P, go122.EventString(typ)) 445 } 446 pState.status = go122.ProcSyscall 447 // Validate the P sequence number on the event and advance it. 448 // 449 // We have a P sequence number for what is supposed to be a goroutine event 450 // so that we can correctly model P stealing. Without this sequence number here, 451 // the syscall from which a ProcSteal event is stealing can be ambiguous in the 452 // face of broken timestamps. See the go122-syscall-steal-proc-ambiguous test for 453 // more details. 454 // 455 // Note that because this sequence number only exists as a tool for disambiguation, 456 // we can enforce that we have the right sequence number at this point; we don't need 457 // to back off and see if any other events will advance. This is a running P. 458 pSeq := makeSeq(gen, ev.args[0]) 459 if !pSeq.succeeds(pState.seq) { 460 return curCtx, false, fmt.Errorf("failed to advance %s: can't make sequence: %s -> %s", go122.EventString(typ), pState.seq, pSeq) 461 } 462 pState.seq = pSeq 463 return curCtx, true, nil 464 case go122.EvGoSyscallEnd: 465 // This event is always advance-able because it happens on the same 466 // thread that EvGoSyscallStart happened, and the goroutine can't leave 467 // that thread until its done. 468 if err := validateCtx(curCtx, event.UserGoReqs); err != nil { 469 return curCtx, false, err 470 } 471 state, ok := o.gStates[curCtx.G] 472 if !ok { 473 return curCtx, false, fmt.Errorf("event %s for goroutine (%v) that doesn't exist", go122.EventString(typ), curCtx.G) 474 } 475 if state.status != go122.GoSyscall { 476 return curCtx, false, fmt.Errorf("%s event for goroutine that's not %s", go122.EventString(typ), GoRunning) 477 } 478 state.status = go122.GoRunning 479 480 // Transfer the P back to running from syscall. 481 pState, ok := o.pStates[curCtx.P] 482 if !ok { 483 return curCtx, false, fmt.Errorf("uninitialized proc %d found during %s", curCtx.P, go122.EventString(typ)) 484 } 485 if pState.status != go122.ProcSyscall { 486 return curCtx, false, fmt.Errorf("expected proc %d in state %v, but got %v instead", curCtx.P, go122.ProcSyscall, pState.status) 487 } 488 pState.status = go122.ProcRunning 489 return curCtx, true, nil 490 case go122.EvGoSyscallEndBlocked: 491 // This event becomes advanceable when its P is not in a syscall state 492 // (lack of a P altogether is also acceptable for advancing). 493 // The transfer out of ProcSyscall can happen either voluntarily via 494 // ProcStop or involuntarily via ProcSteal. We may also acquire a new P 495 // before we get here (after the transfer out) but that's OK: that new 496 // P won't be in the ProcSyscall state anymore. 497 // 498 // Basically: while we have a preemptible P, don't advance, because we 499 // *know* from the event that we're going to lose it at some point during 500 // the syscall. We shouldn't advance until that happens. 501 if curCtx.P != NoProc { 502 pState, ok := o.pStates[curCtx.P] 503 if !ok { 504 return curCtx, false, fmt.Errorf("uninitialized proc %d found during %s", curCtx.P, go122.EventString(typ)) 505 } 506 if pState.status == go122.ProcSyscall { 507 return curCtx, false, nil 508 } 509 } 510 // As mentioned above, we may have a P here if we ProcStart 511 // before this event. 512 if err := validateCtx(curCtx, event.SchedReqs{Thread: event.MustHave, Proc: event.MayHave, Goroutine: event.MustHave}); err != nil { 513 return curCtx, false, err 514 } 515 state, ok := o.gStates[curCtx.G] 516 if !ok { 517 return curCtx, false, fmt.Errorf("event %s for goroutine (%v) that doesn't exist", go122.EventString(typ), curCtx.G) 518 } 519 if state.status != go122.GoSyscall { 520 return curCtx, false, fmt.Errorf("%s event for goroutine that's not %s", go122.EventString(typ), GoRunning) 521 } 522 newCtx.G = NoGoroutine 523 state.status = go122.GoRunnable 524 return curCtx, true, nil 525 case go122.EvGoCreateSyscall: 526 // This event indicates that a goroutine is effectively 527 // being created out of a cgo callback. Such a goroutine 528 // is 'created' in the syscall state. 529 if err := validateCtx(curCtx, event.SchedReqs{Thread: event.MustHave, Proc: event.MayHave, Goroutine: event.MustNotHave}); err != nil { 530 return curCtx, false, err 531 } 532 // This goroutine is effectively being created. Add a state for it. 533 newgid := GoID(ev.args[0]) 534 if _, ok := o.gStates[newgid]; ok { 535 return curCtx, false, fmt.Errorf("tried to create goroutine (%v) in syscall that already exists", newgid) 536 } 537 o.gStates[newgid] = &gState{id: newgid, status: go122.GoSyscall, seq: makeSeq(gen, 0)} 538 // Goroutine is executing. Bind it to the context. 539 newCtx.G = newgid 540 return curCtx, true, nil 541 case go122.EvGoDestroySyscall: 542 // This event indicates that a goroutine created for a 543 // cgo callback is disappearing, either because the callback 544 // ending or the C thread that called it is being destroyed. 545 // 546 // Also, treat this as if we lost our P too. 547 // The thread ID may be reused by the platform and we'll get 548 // really confused if we try to steal the P is this is running 549 // with later. The new M with the same ID could even try to 550 // steal back this P from itself! 551 // 552 // The runtime is careful to make sure that any GoCreateSyscall 553 // event will enter the runtime emitting events for reacquiring a P. 554 // 555 // Note: we might have a P here. The P might not be released 556 // eagerly by the runtime, and it might get stolen back later 557 // (or never again, if the program is going to exit). 558 if err := validateCtx(curCtx, event.SchedReqs{Thread: event.MustHave, Proc: event.MayHave, Goroutine: event.MustHave}); err != nil { 559 return curCtx, false, err 560 } 561 // Check to make sure the goroutine exists in the right state. 562 state, ok := o.gStates[curCtx.G] 563 if !ok { 564 return curCtx, false, fmt.Errorf("event %s for goroutine (%v) that doesn't exist", go122.EventString(typ), curCtx.G) 565 } 566 if state.status != go122.GoSyscall { 567 return curCtx, false, fmt.Errorf("%s event for goroutine that's not %v", go122.EventString(typ), GoSyscall) 568 } 569 // This goroutine is exiting itself. 570 delete(o.gStates, curCtx.G) 571 newCtx.G = NoGoroutine 572 573 // If we have a proc, then we're dissociating from it now. See the comment at the top of the case. 574 if curCtx.P != NoProc { 575 pState, ok := o.pStates[curCtx.P] 576 if !ok { 577 return curCtx, false, fmt.Errorf("found invalid proc %d during %s", curCtx.P, go122.EventString(typ)) 578 } 579 if pState.status != go122.ProcSyscall { 580 return curCtx, false, fmt.Errorf("proc %d in unexpected state %s during %s", curCtx.P, pState.status, go122.EventString(typ)) 581 } 582 // See the go122-create-syscall-reuse-thread-id test case for more details. 583 pState.status = go122.ProcSyscallAbandoned 584 newCtx.P = NoProc 585 586 // Queue an extra self-ProcSteal event. 587 o.extraEvent = Event{ 588 table: evt, 589 ctx: curCtx, 590 base: baseEvent{ 591 typ: go122.EvProcSteal, 592 time: ev.time, 593 }, 594 } 595 o.extraEvent.base.args[0] = uint64(curCtx.P) 596 o.extraEvent.base.extra(version.Go122)[0] = uint64(go122.ProcSyscall) 597 } 598 return curCtx, true, nil 599 600 // Handle tasks. Tasks are interesting because: 601 // - There's no Begin event required to reference a task. 602 // - End for a particular task ID can appear multiple times. 603 // As a result, there's very little to validate. The only 604 // thing we have to be sure of is that a task didn't begin 605 // after it had already begun. Task IDs are allowed to be 606 // reused, so we don't care about a Begin after an End. 607 case go122.EvUserTaskBegin: 608 id := TaskID(ev.args[0]) 609 if _, ok := o.activeTasks[id]; ok { 610 return curCtx, false, fmt.Errorf("task ID conflict: %d", id) 611 } 612 // Get the parent ID, but don't validate it. There's no guarantee 613 // we actually have information on whether it's active. 614 parentID := TaskID(ev.args[1]) 615 if parentID == BackgroundTask { 616 // Note: a value of 0 here actually means no parent, *not* the 617 // background task. Automatic background task attachment only 618 // applies to regions. 619 parentID = NoTask 620 ev.args[1] = uint64(NoTask) 621 } 622 623 // Validate the name and record it. We'll need to pass it through to 624 // EvUserTaskEnd. 625 nameID := stringID(ev.args[2]) 626 name, ok := evt.strings.get(nameID) 627 if !ok { 628 return curCtx, false, fmt.Errorf("invalid string ID %v for %v event", nameID, typ) 629 } 630 o.activeTasks[id] = taskState{name: name, parentID: parentID} 631 return curCtx, true, validateCtx(curCtx, event.UserGoReqs) 632 case go122.EvUserTaskEnd: 633 id := TaskID(ev.args[0]) 634 if ts, ok := o.activeTasks[id]; ok { 635 // Smuggle the task info. This may happen in a different generation, 636 // which may not have the name in its string table. Add it to the extra 637 // strings table so we can look it up later. 638 ev.extra(version.Go122)[0] = uint64(ts.parentID) 639 ev.extra(version.Go122)[1] = uint64(evt.addExtraString(ts.name)) 640 delete(o.activeTasks, id) 641 } else { 642 // Explicitly clear the task info. 643 ev.extra(version.Go122)[0] = uint64(NoTask) 644 ev.extra(version.Go122)[1] = uint64(evt.addExtraString("")) 645 } 646 return curCtx, true, validateCtx(curCtx, event.UserGoReqs) 647 648 // Handle user regions. 649 case go122.EvUserRegionBegin: 650 if err := validateCtx(curCtx, event.UserGoReqs); err != nil { 651 return curCtx, false, err 652 } 653 tid := TaskID(ev.args[0]) 654 nameID := stringID(ev.args[1]) 655 name, ok := evt.strings.get(nameID) 656 if !ok { 657 return curCtx, false, fmt.Errorf("invalid string ID %v for %v event", nameID, typ) 658 } 659 gState, ok := o.gStates[curCtx.G] 660 if !ok { 661 return curCtx, false, fmt.Errorf("encountered EvUserRegionBegin without known state for current goroutine %d", curCtx.G) 662 } 663 if err := gState.beginRegion(userRegion{tid, name}); err != nil { 664 return curCtx, false, err 665 } 666 return curCtx, true, nil 667 case go122.EvUserRegionEnd: 668 if err := validateCtx(curCtx, event.UserGoReqs); err != nil { 669 return curCtx, false, err 670 } 671 tid := TaskID(ev.args[0]) 672 nameID := stringID(ev.args[1]) 673 name, ok := evt.strings.get(nameID) 674 if !ok { 675 return curCtx, false, fmt.Errorf("invalid string ID %v for %v event", nameID, typ) 676 } 677 gState, ok := o.gStates[curCtx.G] 678 if !ok { 679 return curCtx, false, fmt.Errorf("encountered EvUserRegionEnd without known state for current goroutine %d", curCtx.G) 680 } 681 if err := gState.endRegion(userRegion{tid, name}); err != nil { 682 return curCtx, false, err 683 } 684 return curCtx, true, nil 685 686 // Handle the GC mark phase. 687 // 688 // We have sequence numbers for both start and end because they 689 // can happen on completely different threads. We want an explicit 690 // partial order edge between start and end here, otherwise we're 691 // relying entirely on timestamps to make sure we don't advance a 692 // GCEnd for a _different_ GC cycle if timestamps are wildly broken. 693 case go122.EvGCActive: 694 seq := ev.args[0] 695 if gen == o.initialGen { 696 if o.gcState != gcUndetermined { 697 return curCtx, false, fmt.Errorf("GCActive in the first generation isn't first GC event") 698 } 699 o.gcSeq = seq 700 o.gcState = gcRunning 701 return curCtx, true, nil 702 } 703 if seq != o.gcSeq+1 { 704 // This is not the right GC cycle. 705 return curCtx, false, nil 706 } 707 if o.gcState != gcRunning { 708 return curCtx, false, fmt.Errorf("encountered GCActive while GC was not in progress") 709 } 710 o.gcSeq = seq 711 if err := validateCtx(curCtx, event.UserGoReqs); err != nil { 712 return curCtx, false, err 713 } 714 return curCtx, true, nil 715 case go122.EvGCBegin: 716 seq := ev.args[0] 717 if o.gcState == gcUndetermined { 718 o.gcSeq = seq 719 o.gcState = gcRunning 720 return curCtx, true, nil 721 } 722 if seq != o.gcSeq+1 { 723 // This is not the right GC cycle. 724 return curCtx, false, nil 725 } 726 if o.gcState == gcRunning { 727 return curCtx, false, fmt.Errorf("encountered GCBegin while GC was already in progress") 728 } 729 o.gcSeq = seq 730 o.gcState = gcRunning 731 if err := validateCtx(curCtx, event.UserGoReqs); err != nil { 732 return curCtx, false, err 733 } 734 return curCtx, true, nil 735 case go122.EvGCEnd: 736 seq := ev.args[0] 737 if seq != o.gcSeq+1 { 738 // This is not the right GC cycle. 739 return curCtx, false, nil 740 } 741 if o.gcState == gcNotRunning { 742 return curCtx, false, fmt.Errorf("encountered GCEnd when GC was not in progress") 743 } 744 if o.gcState == gcUndetermined { 745 return curCtx, false, fmt.Errorf("encountered GCEnd when GC was in an undetermined state") 746 } 747 o.gcSeq = seq 748 o.gcState = gcNotRunning 749 if err := validateCtx(curCtx, event.UserGoReqs); err != nil { 750 return curCtx, false, err 751 } 752 return curCtx, true, nil 753 754 // Handle simple instantaneous events that require a G. 755 case go122.EvGoLabel, go122.EvProcsChange, go122.EvUserLog: 756 if err := validateCtx(curCtx, event.UserGoReqs); err != nil { 757 return curCtx, false, err 758 } 759 return curCtx, true, nil 760 761 // Handle allocation states, which don't require a G. 762 case go122.EvHeapAlloc, go122.EvHeapGoal: 763 if err := validateCtx(curCtx, event.SchedReqs{Thread: event.MustHave, Proc: event.MustHave, Goroutine: event.MayHave}); err != nil { 764 return curCtx, false, err 765 } 766 return curCtx, true, nil 767 768 // Handle sweep, which is bound to a P and doesn't require a G. 769 case go122.EvGCSweepBegin: 770 if err := validateCtx(curCtx, event.SchedReqs{Thread: event.MustHave, Proc: event.MustHave, Goroutine: event.MayHave}); err != nil { 771 return curCtx, false, err 772 } 773 if err := o.pStates[curCtx.P].beginRange(makeRangeType(typ, 0)); err != nil { 774 return curCtx, false, err 775 } 776 return curCtx, true, nil 777 case go122.EvGCSweepActive: 778 pid := ProcID(ev.args[0]) 779 // N.B. In practice Ps can't block while they're sweeping, so this can only 780 // ever reference curCtx.P. However, be lenient about this like we are with 781 // GCMarkAssistActive; there's no reason the runtime couldn't change to block 782 // in the middle of a sweep. 783 pState, ok := o.pStates[pid] 784 if !ok { 785 return curCtx, false, fmt.Errorf("encountered GCSweepActive for unknown proc %d", pid) 786 } 787 if err := pState.activeRange(makeRangeType(typ, 0), gen == o.initialGen); err != nil { 788 return curCtx, false, err 789 } 790 return curCtx, true, nil 791 case go122.EvGCSweepEnd: 792 if err := validateCtx(curCtx, event.SchedReqs{Thread: event.MustHave, Proc: event.MustHave, Goroutine: event.MayHave}); err != nil { 793 return curCtx, false, err 794 } 795 _, err := o.pStates[curCtx.P].endRange(typ) 796 if err != nil { 797 return curCtx, false, err 798 } 799 return curCtx, true, nil 800 801 // Handle special goroutine-bound event ranges. 802 case go122.EvSTWBegin, go122.EvGCMarkAssistBegin: 803 if err := validateCtx(curCtx, event.UserGoReqs); err != nil { 804 return curCtx, false, err 805 } 806 desc := stringID(0) 807 if typ == go122.EvSTWBegin { 808 desc = stringID(ev.args[0]) 809 } 810 gState, ok := o.gStates[curCtx.G] 811 if !ok { 812 return curCtx, false, fmt.Errorf("encountered event of type %d without known state for current goroutine %d", typ, curCtx.G) 813 } 814 if err := gState.beginRange(makeRangeType(typ, desc)); err != nil { 815 return curCtx, false, err 816 } 817 return curCtx, true, nil 818 case go122.EvGCMarkAssistActive: 819 gid := GoID(ev.args[0]) 820 // N.B. Like GoStatus, this can happen at any time, because it can 821 // reference a non-running goroutine. Don't check anything about the 822 // current scheduler context. 823 gState, ok := o.gStates[gid] 824 if !ok { 825 return curCtx, false, fmt.Errorf("uninitialized goroutine %d found during %s", gid, go122.EventString(typ)) 826 } 827 if err := gState.activeRange(makeRangeType(typ, 0), gen == o.initialGen); err != nil { 828 return curCtx, false, err 829 } 830 return curCtx, true, nil 831 case go122.EvSTWEnd, go122.EvGCMarkAssistEnd: 832 if err := validateCtx(curCtx, event.UserGoReqs); err != nil { 833 return curCtx, false, err 834 } 835 gState, ok := o.gStates[curCtx.G] 836 if !ok { 837 return curCtx, false, fmt.Errorf("encountered event of type %d without known state for current goroutine %d", typ, curCtx.G) 838 } 839 desc, err := gState.endRange(typ) 840 if err != nil { 841 return curCtx, false, err 842 } 843 if typ == go122.EvSTWEnd { 844 // Smuggle the kind into the event. 845 // Don't use ev.extra here so we have symmetry with STWBegin. 846 ev.args[0] = uint64(desc) 847 } 848 return curCtx, true, nil 849 } 850 return curCtx, false, fmt.Errorf("bad event type found while ordering: %v", ev.typ) 851 } 852 853 // schedCtx represents the scheduling resources associated with an event. 854 type schedCtx struct { 855 G GoID 856 P ProcID 857 M ThreadID 858 } 859 860 // validateCtx ensures that ctx conforms to some reqs, returning an error if 861 // it doesn't. 862 func validateCtx(ctx schedCtx, reqs event.SchedReqs) error { 863 // Check thread requirements. 864 if reqs.Thread == event.MustHave && ctx.M == NoThread { 865 return fmt.Errorf("expected a thread but didn't have one") 866 } else if reqs.Thread == event.MustNotHave && ctx.M != NoThread { 867 return fmt.Errorf("expected no thread but had one") 868 } 869 870 // Check proc requirements. 871 if reqs.Proc == event.MustHave && ctx.P == NoProc { 872 return fmt.Errorf("expected a proc but didn't have one") 873 } else if reqs.Proc == event.MustNotHave && ctx.P != NoProc { 874 return fmt.Errorf("expected no proc but had one") 875 } 876 877 // Check goroutine requirements. 878 if reqs.Goroutine == event.MustHave && ctx.G == NoGoroutine { 879 return fmt.Errorf("expected a goroutine but didn't have one") 880 } else if reqs.Goroutine == event.MustNotHave && ctx.G != NoGoroutine { 881 return fmt.Errorf("expected no goroutine but had one") 882 } 883 return nil 884 } 885 886 // gcState is a trinary variable for the current state of the GC. 887 // 888 // The third state besides "enabled" and "disabled" is "undetermined." 889 type gcState uint8 890 891 const ( 892 gcUndetermined gcState = iota 893 gcNotRunning 894 gcRunning 895 ) 896 897 // String returns a human-readable string for the GC state. 898 func (s gcState) String() string { 899 switch s { 900 case gcUndetermined: 901 return "Undetermined" 902 case gcNotRunning: 903 return "NotRunning" 904 case gcRunning: 905 return "Running" 906 } 907 return "Bad" 908 } 909 910 // userRegion represents a unique user region when attached to some gState. 911 type userRegion struct { 912 // name must be a resolved string because the string ID for the same 913 // string may change across generations, but we care about checking 914 // the value itself. 915 taskID TaskID 916 name string 917 } 918 919 // rangeType is a way to classify special ranges of time. 920 // 921 // These typically correspond 1:1 with "Begin" events, but 922 // they may have an optional subtype that describes the range 923 // in more detail. 924 type rangeType struct { 925 typ event.Type // "Begin" event. 926 desc stringID // Optional subtype. 927 } 928 929 // makeRangeType constructs a new rangeType. 930 func makeRangeType(typ event.Type, desc stringID) rangeType { 931 if styp := go122.Specs()[typ].StartEv; styp != go122.EvNone { 932 typ = styp 933 } 934 return rangeType{typ, desc} 935 } 936 937 // gState is the state of a goroutine at a point in the trace. 938 type gState struct { 939 id GoID 940 status go122.GoStatus 941 seq seqCounter 942 943 // regions are the active user regions for this goroutine. 944 regions []userRegion 945 946 // rangeState is the state of special time ranges bound to this goroutine. 947 rangeState 948 } 949 950 // beginRegion starts a user region on the goroutine. 951 func (s *gState) beginRegion(r userRegion) error { 952 s.regions = append(s.regions, r) 953 return nil 954 } 955 956 // endRegion ends a user region on the goroutine. 957 func (s *gState) endRegion(r userRegion) error { 958 if len(s.regions) == 0 { 959 // We do not know about regions that began before tracing started. 960 return nil 961 } 962 if next := s.regions[len(s.regions)-1]; next != r { 963 return fmt.Errorf("misuse of region in goroutine %v: region end %v when the inner-most active region start event is %v", s.id, r, next) 964 } 965 s.regions = s.regions[:len(s.regions)-1] 966 return nil 967 } 968 969 // pState is the state of a proc at a point in the trace. 970 type pState struct { 971 id ProcID 972 status go122.ProcStatus 973 seq seqCounter 974 975 // rangeState is the state of special time ranges bound to this proc. 976 rangeState 977 } 978 979 // mState is the state of a thread at a point in the trace. 980 type mState struct { 981 g GoID // Goroutine bound to this M. (The goroutine's state is Executing.) 982 p ProcID // Proc bound to this M. (The proc's state is Executing.) 983 } 984 985 // rangeState represents the state of special time ranges. 986 type rangeState struct { 987 // inFlight contains the rangeTypes of any ranges bound to a resource. 988 inFlight []rangeType 989 } 990 991 // beginRange begins a special range in time on the goroutine. 992 // 993 // Returns an error if the range is already in progress. 994 func (s *rangeState) beginRange(typ rangeType) error { 995 if s.hasRange(typ) { 996 return fmt.Errorf("discovered event already in-flight for when starting event %v", go122.Specs()[typ.typ].Name) 997 } 998 s.inFlight = append(s.inFlight, typ) 999 return nil 1000 } 1001 1002 // activeRange marks special range in time on the goroutine as active in the 1003 // initial generation, or confirms that it is indeed active in later generations. 1004 func (s *rangeState) activeRange(typ rangeType, isInitialGen bool) error { 1005 if isInitialGen { 1006 if s.hasRange(typ) { 1007 return fmt.Errorf("found named active range already in first gen: %v", typ) 1008 } 1009 s.inFlight = append(s.inFlight, typ) 1010 } else if !s.hasRange(typ) { 1011 return fmt.Errorf("resource is missing active range: %v %v", go122.Specs()[typ.typ].Name, s.inFlight) 1012 } 1013 return nil 1014 } 1015 1016 // hasRange returns true if a special time range on the goroutine as in progress. 1017 func (s *rangeState) hasRange(typ rangeType) bool { 1018 for _, ftyp := range s.inFlight { 1019 if ftyp == typ { 1020 return true 1021 } 1022 } 1023 return false 1024 } 1025 1026 // endsRange ends a special range in time on the goroutine. 1027 // 1028 // This must line up with the start event type of the range the goroutine is currently in. 1029 func (s *rangeState) endRange(typ event.Type) (stringID, error) { 1030 st := go122.Specs()[typ].StartEv 1031 idx := -1 1032 for i, r := range s.inFlight { 1033 if r.typ == st { 1034 idx = i 1035 break 1036 } 1037 } 1038 if idx < 0 { 1039 return 0, fmt.Errorf("tried to end event %v, but not in-flight", go122.Specs()[st].Name) 1040 } 1041 // Swap remove. 1042 desc := s.inFlight[idx].desc 1043 s.inFlight[idx], s.inFlight[len(s.inFlight)-1] = s.inFlight[len(s.inFlight)-1], s.inFlight[idx] 1044 s.inFlight = s.inFlight[:len(s.inFlight)-1] 1045 return desc, nil 1046 } 1047 1048 // seqCounter represents a global sequence counter for a resource. 1049 type seqCounter struct { 1050 gen uint64 // The generation for the local sequence counter seq. 1051 seq uint64 // The sequence number local to the generation. 1052 } 1053 1054 // makeSeq creates a new seqCounter. 1055 func makeSeq(gen, seq uint64) seqCounter { 1056 return seqCounter{gen: gen, seq: seq} 1057 } 1058 1059 // succeeds returns true if a is the immediate successor of b. 1060 func (a seqCounter) succeeds(b seqCounter) bool { 1061 return a.gen == b.gen && a.seq == b.seq+1 1062 } 1063 1064 // String returns a debug string representation of the seqCounter. 1065 func (c seqCounter) String() string { 1066 return fmt.Sprintf("%d (gen=%d)", c.seq, c.gen) 1067 } 1068 1069 func dumpOrdering(order *ordering) string { 1070 var sb strings.Builder 1071 for id, state := range order.gStates { 1072 fmt.Fprintf(&sb, "G %d [status=%s seq=%s]\n", id, state.status, state.seq) 1073 } 1074 fmt.Fprintln(&sb) 1075 for id, state := range order.pStates { 1076 fmt.Fprintf(&sb, "P %d [status=%s seq=%s]\n", id, state.status, state.seq) 1077 } 1078 fmt.Fprintln(&sb) 1079 for id, state := range order.mStates { 1080 fmt.Fprintf(&sb, "M %d [g=%d p=%d]\n", id, state.g, state.p) 1081 } 1082 fmt.Fprintln(&sb) 1083 fmt.Fprintf(&sb, "GC %d %s\n", order.gcSeq, order.gcState) 1084 return sb.String() 1085 } 1086 1087 // taskState represents an active task. 1088 type taskState struct { 1089 // name is the type of the active task. 1090 name string 1091 1092 // parentID is the parent ID of the active task. 1093 parentID TaskID 1094 }