github.com/grailbio/bigslice@v0.0.0-20230519005545-30c4c12152ad/cmd/slicetrace/session.go (about) 1 // Copyright 2020 GRAIL, Inc. All rights reserved. 2 // Use of this source code is governed by the Apache 2.0 3 // license that can be found in the LICENSE file. 4 5 package main 6 7 import ( 8 "fmt" 9 "log" 10 "regexp" 11 "sort" 12 "strconv" 13 "strings" 14 "time" 15 16 "github.com/grailbio/base/limitbuf" 17 "github.com/grailbio/bigslice/internal/trace" 18 ) 19 20 // invocation represents an invocation of a bigslice Func. 21 type invocation struct { 22 index int 23 location string 24 args string 25 } 26 27 // task represents statistics from the execution of a single task. 28 type task struct { 29 invIndex int 30 op string 31 shards int 32 shard int 33 // start is measured as a duration offset from the start of tracing. 34 start time.Duration 35 duration time.Duration 36 read time.Duration 37 write time.Duration 38 } 39 40 // opStat represents useful statistics for a single op of a particular 41 // invocation. 42 type opStat struct { 43 invIndex int 44 op string 45 shards int 46 // start is measured as a duration offset from the start of tracing. 47 start time.Duration 48 duration time.Duration 49 total time.Duration 50 read time.Duration 51 write time.Duration 52 min time.Duration 53 q1 time.Duration 54 q2 time.Duration 55 q3 time.Duration 56 max time.Duration 57 } 58 59 // session represents the trace events from a bigslice session, interpreted for 60 // display of useful diagnostics. 61 type session struct { 62 invs []invocation 63 tasks []task 64 opStats []opStat 65 } 66 67 // reInv is used to match the event name of "invocation" category events. 68 // The name is the invocation index, e.g. "4". If the invocation was exclusive, 69 // the names has "[x]" appended, e.g. "4[x]". 70 var reInv = regexp.MustCompile(`^(\d+)`) 71 72 // reTask is used to match the event name of "task" category events. The name is 73 // the full task name, from which we parse the invocation index, op name, shard 74 // number, and total number of shards. For example, the task name 75 // "inv2_reader_map@2000:132" will be parsed as invocation 2, op "reader_map", 76 // shards 2000, shard 132. 77 var reTask = regexp.MustCompile(`inv(\d+)_([^@]*)@(\d+):(\d+)`) 78 79 func newSession(events []trace.Event) *session { 80 tasks := buildTasks(events) 81 return &session{ 82 invs: buildInvs(events), 83 tasks: tasks, 84 opStats: buildOpStats(tasks), 85 } 86 } 87 88 // Invs returns all of the invocations from the session s. 89 func (s *session) Invs() []invocation { 90 return s.invs 91 } 92 93 // OpStats returns all of the op statistics for the invocation indicated by 94 // invIndex. 95 func (s *session) OpStats(invIndex int) []opStat { 96 var opStats []opStat 97 for _, opStat := range s.opStats { 98 if opStat.invIndex != invIndex { 99 continue 100 } 101 opStats = append(opStats, opStat) 102 } 103 sort.Slice(opStats, func(i, j int) bool { 104 return opStats[i].start < opStats[j].start 105 }) 106 return opStats 107 } 108 109 func buildInvs(events []trace.Event) []invocation { 110 invByIndex := make(map[int]invocation) 111 for _, event := range events { 112 if event.Cat != "invocation" { 113 continue 114 } 115 matches := reInv.FindStringSubmatch(event.Name) 116 if matches == nil { 117 log.Printf("could not parse name: %#v", event) 118 continue 119 } 120 index, err := strconv.Atoi(matches[1]) 121 if err != nil { 122 log.Printf("could not parse invocation index from name: %s", event.Name) 123 continue 124 } 125 if _, ok := invByIndex[index]; ok { 126 // We just assume that the invocations across machines are all the 127 // same, so we just take the first one, and ignore the rest. 128 continue 129 } 130 var args []string 131 for _, invArg := range event.Args["args"].([]interface{}) { 132 args = append(args, fmt.Sprintf("\"%s\"", invArg.(string))) 133 } 134 invByIndex[index] = invocation{ 135 index: index, 136 location: event.Args["location"].(string), 137 args: truncatef(strings.Join(args, ", ")), 138 } 139 } 140 invs := make([]invocation, 0, len(invByIndex)) 141 for _, inv := range invByIndex { 142 invs = append(invs, inv) 143 } 144 sort.Slice(invs, func(i, j int) bool { return invs[i].index < invs[j].index }) 145 return invs 146 } 147 148 func buildTasks(events []trace.Event) []task { 149 var tasks []task 150 for _, event := range events { 151 if event.Cat != "task" { 152 continue 153 } 154 matches := reTask.FindStringSubmatch(event.Name) 155 if matches == nil { 156 log.Printf("could not parse name: %#v", event) 157 continue 158 } 159 invIndex, err := strconv.Atoi(matches[1]) 160 if err != nil { 161 log.Printf("could not parse invocation index from name: %s", event.Name) 162 continue 163 } 164 op := matches[2] 165 shards, err := strconv.Atoi(matches[3]) 166 if err != nil { 167 log.Printf("could not parse shards from name: %s", event.Name) 168 continue 169 } 170 shard, err := strconv.Atoi(matches[4]) 171 if err != nil { 172 log.Printf("could not parse shard from name: %s", event.Name) 173 continue 174 } 175 readDuration := int64(event.Args["readDuration"].(float64)) 176 writeDuration := int64(event.Args["writeDuration"].(float64)) 177 tasks = append(tasks, task{ 178 invIndex: invIndex, 179 op: op, 180 shards: shards, 181 shard: shard, 182 start: time.Duration(event.Ts * 1e3), 183 duration: time.Duration(event.Dur * 1e3), 184 read: time.Duration(readDuration * 1e3), 185 write: time.Duration(writeDuration * 1e3), 186 }) 187 } 188 return tasks 189 } 190 191 func buildOpStats(tasks []task) []opStat { 192 type invOp struct { 193 invIndex int 194 op string 195 } 196 type accum struct { 197 shards int 198 minStart time.Duration 199 maxEnd time.Duration 200 durations []time.Duration 201 total time.Duration 202 read time.Duration 203 write time.Duration 204 } 205 accums := make(map[invOp]*accum) 206 for _, task := range tasks { 207 key := invOp{task.invIndex, task.op} 208 a, ok := accums[key] 209 if !ok { 210 a = &accum{ 211 minStart: 1<<63 - 1, 212 } 213 accums[key] = a 214 // We expect all tasks to have the same total number of shards, so 215 // we set it on the first task we see and then verify for all 216 // subsequent tasks. 217 a.shards = task.shards 218 } 219 if task.start < a.minStart { 220 a.minStart = task.start 221 } 222 end := task.start + task.duration 223 if a.maxEnd < end { 224 a.maxEnd = end 225 } 226 if task.shards != a.shards { 227 log.Fatalf("different total number of shards: got %d, want %d", task.shards, a.shards) 228 } 229 a.durations = append(a.durations, task.duration) 230 a.total += task.duration 231 a.read += task.read 232 a.write += task.write 233 } 234 opStats := make([]opStat, 0, len(accums)) 235 for invOp, a := range accums { 236 sort.Slice(a.durations, func(i, j int) bool { 237 return a.durations[i] < a.durations[j] 238 }) 239 // It is safe to call computeQuartiles, because a.durations is non-empty 240 // by construction. a will only exist if there was a task, and if there 241 // was a task, its duration was added to a.durations. 242 q1, q2, q3 := computeQuartiles(a.durations) 243 opStats = append(opStats, opStat{ 244 invIndex: invOp.invIndex, 245 op: invOp.op, 246 shards: a.shards, 247 start: a.minStart, 248 duration: a.maxEnd - a.minStart, 249 total: a.total, 250 read: a.read, 251 write: a.write, 252 min: a.durations[0], 253 q1: q1, 254 q2: q2, 255 q3: q3, 256 max: a.durations[len(a.durations)-1], 257 }) 258 } 259 return opStats 260 } 261 262 func truncatef(v interface{}) string { 263 b := limitbuf.NewLogger(80) 264 fmt.Fprint(b, v) 265 return b.String() 266 }