github.com/grailbio/bigslice@v0.0.0-20230519005545-30c4c12152ad/exec/tracer.go (about) 1 // Copyright 2019 GRAIL, Inc. All rights reserved. 2 // Use of this source code is governed by the Apache 2.0 3 // license that can be found in the LICENSE file. 4 5 package exec 6 7 import ( 8 "fmt" 9 "io" 10 "strings" 11 "sync" 12 "time" 13 14 "github.com/grailbio/bigslice/internal/trace" 15 ) 16 17 // A tracer tracks a set of trace events associated with objects in 18 // Bigslice. Trace events are logged in the Chrome tracing format and 19 // can be visualized using its built-in visualization tool 20 // (chrome://tracing). Each machine is represented as a Chrome "process", 21 // and individual task or invocation events are tracked by the machine 22 // they are run on. 23 // 24 // To produce easier to interpret visualizations, tracer assigns generated 25 // virtual "thread IDs" to trace events, and events are also coalesced into 26 // "complete events" (X) at the time of rendering. 27 // 28 // TODO(marius): garbage collection of old events. 29 type tracer struct { 30 mu sync.Mutex 31 32 events []trace.Event 33 taskEvents map[*Task][]trace.Event 34 compileEvents map[compileKey][]trace.Event 35 36 machinePids map[*sliceMachine]int 37 machineTidPools map[*sliceMachine]*tidPool 38 39 // firstEvent is used to store the time of the first observed 40 // event so that the offsets in the trace are meaningful. 41 firstEvent time.Time 42 } 43 44 // tidPool is a pool of (virtual) thread IDs that we use to assign Tids to 45 // events. This makes visualization with the Chrome tracing tool much nicer, as 46 // concurrent events are shown on their own rows. The length of the pool is the 47 // maximum number of B events without a matching E event. The indexes of the 48 // slices are the Tids that we allocate, their corresponding value indicating 49 // whether it is considered available for allocation. 50 type tidPool []bool 51 52 // compileKey is the key used for compilation events, which are scoped to a 53 // (machine, invocation). 54 type compileKey struct { 55 addr string 56 inv uint64 57 } 58 59 func newTracer() *tracer { 60 return &tracer{ 61 taskEvents: make(map[*Task][]trace.Event), 62 compileEvents: make(map[compileKey][]trace.Event), 63 machinePids: make(map[*sliceMachine]int), 64 machineTidPools: make(map[*sliceMachine]*tidPool), 65 } 66 } 67 68 // Event logs an event on the provided machine with the given 69 // subject, type (ph), and arguments. The event's subject must be 70 // either a *Task or a bigslice.Invocation; ph is as in Chrome's 71 // tracing format. Arguments is list of interleaved key-value pairs 72 // that are attached as event metadata. Args must be of even length. 73 // 74 // If mach is nil, the event is assigned to the evaluator. 75 func (t *tracer) Event(mach *sliceMachine, subject interface{}, ph string, args ...interface{}) { 76 if t == nil { 77 return 78 } 79 if len(args)%2 != 0 { 80 panic("trace.Event: invalid arguments") 81 } 82 var event trace.Event 83 event.Args = make(map[string]interface{}, len(args)/2) 84 for i := 0; i < len(args); i += 2 { 85 event.Args[fmt.Sprint(args[i])] = args[i+1] 86 } 87 event.Ph = ph 88 t.mu.Lock() 89 defer t.mu.Unlock() 90 if t.firstEvent.IsZero() { 91 t.firstEvent = time.Now() 92 event.Ts = 0 93 } else { 94 event.Ts = time.Since(t.firstEvent).Nanoseconds() / 1e3 95 } 96 if mach != nil { 97 pid, ok := t.machinePids[mach] 98 if !ok { 99 pid = len(t.machinePids) + 1 // pid=0 is reserved for evaluator events 100 t.machinePids[mach] = pid 101 // Attach "process" name metadata so we can identify where a task is running. 102 t.events = append(t.events, trace.Event{ 103 Pid: pid, 104 Ts: event.Ts, 105 Ph: "M", 106 Name: "process_name", 107 Args: map[string]interface{}{ 108 "name": mach.Addr, 109 }, 110 }) 111 } 112 event.Pid = pid 113 } 114 switch arg := subject.(type) { 115 case *Task: 116 event.Name = arg.Name.String() 117 event.Cat = "task" 118 t.assignTid(mach, ph, t.taskEvents[arg], &event) 119 t.taskEvents[arg] = append(t.taskEvents[arg], event) 120 case execInvocation: 121 var name strings.Builder 122 fmt.Fprint(&name, arg.Index) 123 if arg.Exclusive { 124 name.WriteString("[x]") 125 } 126 event.Name = name.String() 127 event.Cat = "invocation" 128 key := compileKey{mach.Addr, arg.Index} 129 t.assignTid(mach, ph, t.compileEvents[key], &event) 130 t.compileEvents[key] = append(t.compileEvents[key], event) 131 default: 132 panic(fmt.Sprintf("unsupported subject type %T", subject)) 133 } 134 } 135 136 // assignTid assigns a thread ID to event, using mach's tid pool and type of 137 // event. events is the slices of existing relevant events, e.g. 138 // t.taskEvents[arg]. 139 func (t *tracer) assignTid(mach *sliceMachine, ph string, events []trace.Event, event *trace.Event) { 140 event.Tid = 0 141 pool, ok := t.machineTidPools[mach] 142 if !ok { 143 pool = new(tidPool) 144 t.machineTidPools[mach] = pool 145 } 146 switch ph { 147 case "B": 148 event.Tid = pool.Acquire() 149 case "E": 150 if len(events) == 0 { 151 break 152 } 153 lastEvent := events[len(events)-1] 154 if lastEvent.Ph != "B" { 155 break 156 } 157 event.Tid = lastEvent.Tid 158 pool.Release(event.Tid) 159 } 160 } 161 162 // Marshal writes the trace captured by t into the writer w in 163 // Chrome's event tracing format. 164 func (t *tracer) Marshal(w io.Writer) error { 165 t.mu.Lock() 166 events := make([]trace.Event, len(t.events)) 167 copy(events, t.events) 168 for _, v := range t.taskEvents { 169 events = appendCoalesce(events, v, t.firstEvent) 170 } 171 for _, v := range t.compileEvents { 172 events = appendCoalesce(events, v, t.firstEvent) 173 } 174 t.mu.Unlock() 175 176 trace := trace.T{Events: events} 177 return trace.Encode(w) 178 } 179 180 // appendCoalesce appends a set of events on the provided list, 181 // first coalescing events so that "B" and "E" events are matched 182 // into a single "X" event. This produces more visually compact (and 183 // useful) trace visualizations. appendCoalesce also prunes orphan 184 // events. 185 func appendCoalesce(list []trace.Event, events []trace.Event, firstEvent time.Time) []trace.Event { 186 var begIndex = -1 187 for _, event := range events { 188 if event.Ph == "B" && begIndex < 0 { 189 begIndex = len(list) 190 } 191 if event.Ph == "E" && begIndex >= 0 { 192 list[begIndex].Ph = "X" 193 list[begIndex].Dur = event.Ts - list[begIndex].Ts 194 if list[begIndex].Dur == 0 { 195 list[begIndex].Dur = 1 196 } 197 for k, v := range event.Args { 198 if _, ok := list[begIndex].Args[k]; !ok { 199 list[begIndex].Args[k] = v 200 } 201 } 202 // Reset the begin index, so that if a task fails but is retried 203 // on the same machine; then these are captured as two unique 204 // events. 205 begIndex = -1 206 } else if event.Ph != "E" { 207 list = append(list, event) 208 } // drop unmatched "E"s 209 } 210 if begIndex >= 0 { 211 // We have an unmatched "B". Drop it. 212 copy(list[begIndex:], list[begIndex+1:]) 213 list = list[:len(list)-1] 214 } 215 return list 216 } 217 218 // Acquire acquires an available thread ID from pool p. Thread IDs are 219 // sequential and 1-indexed, preserving 0 for events without meaningful thread 220 // IDs. 221 func (p *tidPool) Acquire() int { 222 for tid, available := range *p { 223 if available { 224 (*p)[tid] = false 225 return tid + 1 226 } 227 } 228 // Nothing available in the pool, so grow it. 229 tid := len(*p) 230 *p = append(*p, false) 231 return tid + 1 232 } 233 234 // Release releases a tid, a thread ID previously acquired in Acquire. This 235 // makes it available to be returned from a future call to Acquire. 236 func (p *tidPool) Release(tid int) { 237 if (*p)[tid-1] { 238 panic("releasing unallocated tid") 239 240 } 241 (*p)[tid-1] = true 242 }