github.com/grailbio/bigslice@v0.0.0-20230519005545-30c4c12152ad/cmd/slicetrace/session.go (about)

     1  // Copyright 2020 GRAIL, Inc. All rights reserved.
     2  // Use of this source code is governed by the Apache 2.0
     3  // license that can be found in the LICENSE file.
     4  
     5  package main
     6  
     7  import (
     8  	"fmt"
     9  	"log"
    10  	"regexp"
    11  	"sort"
    12  	"strconv"
    13  	"strings"
    14  	"time"
    15  
    16  	"github.com/grailbio/base/limitbuf"
    17  	"github.com/grailbio/bigslice/internal/trace"
    18  )
    19  
    20  // invocation represents an invocation of a bigslice Func.
    21  type invocation struct {
    22  	index    int
    23  	location string
    24  	args     string
    25  }
    26  
    27  // task represents statistics from the execution of a single task.
    28  type task struct {
    29  	invIndex int
    30  	op       string
    31  	shards   int
    32  	shard    int
    33  	// start is measured as a duration offset from the start of tracing.
    34  	start    time.Duration
    35  	duration time.Duration
    36  	read     time.Duration
    37  	write    time.Duration
    38  }
    39  
    40  // opStat represents useful statistics for a single op of a particular
    41  // invocation.
    42  type opStat struct {
    43  	invIndex int
    44  	op       string
    45  	shards   int
    46  	// start is measured as a duration offset from the start of tracing.
    47  	start    time.Duration
    48  	duration time.Duration
    49  	total    time.Duration
    50  	read     time.Duration
    51  	write    time.Duration
    52  	min      time.Duration
    53  	q1       time.Duration
    54  	q2       time.Duration
    55  	q3       time.Duration
    56  	max      time.Duration
    57  }
    58  
    59  // session represents the trace events from a bigslice session, interpreted for
    60  // display of useful diagnostics.
    61  type session struct {
    62  	invs    []invocation
    63  	tasks   []task
    64  	opStats []opStat
    65  }
    66  
    67  // reInv is used to match the event name of "invocation" category events.
    68  // The name is the invocation index, e.g. "4". If the invocation was exclusive,
    69  // the names has "[x]" appended, e.g. "4[x]".
    70  var reInv = regexp.MustCompile(`^(\d+)`)
    71  
    72  // reTask is used to match the event name of "task" category events. The name is
    73  // the full task name, from which we parse the invocation index, op name, shard
    74  // number, and total number of shards. For example, the task name
    75  // "inv2_reader_map@2000:132" will be parsed as invocation 2, op "reader_map",
    76  // shards 2000, shard 132.
    77  var reTask = regexp.MustCompile(`inv(\d+)_([^@]*)@(\d+):(\d+)`)
    78  
    79  func newSession(events []trace.Event) *session {
    80  	tasks := buildTasks(events)
    81  	return &session{
    82  		invs:    buildInvs(events),
    83  		tasks:   tasks,
    84  		opStats: buildOpStats(tasks),
    85  	}
    86  }
    87  
    88  // Invs returns all of the invocations from the session s.
    89  func (s *session) Invs() []invocation {
    90  	return s.invs
    91  }
    92  
    93  // OpStats returns all of the op statistics for the invocation indicated by
    94  // invIndex.
    95  func (s *session) OpStats(invIndex int) []opStat {
    96  	var opStats []opStat
    97  	for _, opStat := range s.opStats {
    98  		if opStat.invIndex != invIndex {
    99  			continue
   100  		}
   101  		opStats = append(opStats, opStat)
   102  	}
   103  	sort.Slice(opStats, func(i, j int) bool {
   104  		return opStats[i].start < opStats[j].start
   105  	})
   106  	return opStats
   107  }
   108  
   109  func buildInvs(events []trace.Event) []invocation {
   110  	invByIndex := make(map[int]invocation)
   111  	for _, event := range events {
   112  		if event.Cat != "invocation" {
   113  			continue
   114  		}
   115  		matches := reInv.FindStringSubmatch(event.Name)
   116  		if matches == nil {
   117  			log.Printf("could not parse name: %#v", event)
   118  			continue
   119  		}
   120  		index, err := strconv.Atoi(matches[1])
   121  		if err != nil {
   122  			log.Printf("could not parse invocation index from name: %s", event.Name)
   123  			continue
   124  		}
   125  		if _, ok := invByIndex[index]; ok {
   126  			// We just assume that the invocations across machines are all the
   127  			// same, so we just take the first one, and ignore the rest.
   128  			continue
   129  		}
   130  		var args []string
   131  		for _, invArg := range event.Args["args"].([]interface{}) {
   132  			args = append(args, fmt.Sprintf("\"%s\"", invArg.(string)))
   133  		}
   134  		invByIndex[index] = invocation{
   135  			index:    index,
   136  			location: event.Args["location"].(string),
   137  			args:     truncatef(strings.Join(args, ", ")),
   138  		}
   139  	}
   140  	invs := make([]invocation, 0, len(invByIndex))
   141  	for _, inv := range invByIndex {
   142  		invs = append(invs, inv)
   143  	}
   144  	sort.Slice(invs, func(i, j int) bool { return invs[i].index < invs[j].index })
   145  	return invs
   146  }
   147  
   148  func buildTasks(events []trace.Event) []task {
   149  	var tasks []task
   150  	for _, event := range events {
   151  		if event.Cat != "task" {
   152  			continue
   153  		}
   154  		matches := reTask.FindStringSubmatch(event.Name)
   155  		if matches == nil {
   156  			log.Printf("could not parse name: %#v", event)
   157  			continue
   158  		}
   159  		invIndex, err := strconv.Atoi(matches[1])
   160  		if err != nil {
   161  			log.Printf("could not parse invocation index from name: %s", event.Name)
   162  			continue
   163  		}
   164  		op := matches[2]
   165  		shards, err := strconv.Atoi(matches[3])
   166  		if err != nil {
   167  			log.Printf("could not parse shards from name: %s", event.Name)
   168  			continue
   169  		}
   170  		shard, err := strconv.Atoi(matches[4])
   171  		if err != nil {
   172  			log.Printf("could not parse shard from name: %s", event.Name)
   173  			continue
   174  		}
   175  		readDuration := int64(event.Args["readDuration"].(float64))
   176  		writeDuration := int64(event.Args["writeDuration"].(float64))
   177  		tasks = append(tasks, task{
   178  			invIndex: invIndex,
   179  			op:       op,
   180  			shards:   shards,
   181  			shard:    shard,
   182  			start:    time.Duration(event.Ts * 1e3),
   183  			duration: time.Duration(event.Dur * 1e3),
   184  			read:     time.Duration(readDuration * 1e3),
   185  			write:    time.Duration(writeDuration * 1e3),
   186  		})
   187  	}
   188  	return tasks
   189  }
   190  
   191  func buildOpStats(tasks []task) []opStat {
   192  	type invOp struct {
   193  		invIndex int
   194  		op       string
   195  	}
   196  	type accum struct {
   197  		shards    int
   198  		minStart  time.Duration
   199  		maxEnd    time.Duration
   200  		durations []time.Duration
   201  		total     time.Duration
   202  		read      time.Duration
   203  		write     time.Duration
   204  	}
   205  	accums := make(map[invOp]*accum)
   206  	for _, task := range tasks {
   207  		key := invOp{task.invIndex, task.op}
   208  		a, ok := accums[key]
   209  		if !ok {
   210  			a = &accum{
   211  				minStart: 1<<63 - 1,
   212  			}
   213  			accums[key] = a
   214  			// We expect all tasks to have the same total number of shards, so
   215  			// we set it on the first task we see and then verify for all
   216  			// subsequent tasks.
   217  			a.shards = task.shards
   218  		}
   219  		if task.start < a.minStart {
   220  			a.minStart = task.start
   221  		}
   222  		end := task.start + task.duration
   223  		if a.maxEnd < end {
   224  			a.maxEnd = end
   225  		}
   226  		if task.shards != a.shards {
   227  			log.Fatalf("different total number of shards: got %d, want %d", task.shards, a.shards)
   228  		}
   229  		a.durations = append(a.durations, task.duration)
   230  		a.total += task.duration
   231  		a.read += task.read
   232  		a.write += task.write
   233  	}
   234  	opStats := make([]opStat, 0, len(accums))
   235  	for invOp, a := range accums {
   236  		sort.Slice(a.durations, func(i, j int) bool {
   237  			return a.durations[i] < a.durations[j]
   238  		})
   239  		// It is safe to call computeQuartiles, because a.durations is non-empty
   240  		// by construction. a will only exist if there was a task, and if there
   241  		// was a task, its duration was added to a.durations.
   242  		q1, q2, q3 := computeQuartiles(a.durations)
   243  		opStats = append(opStats, opStat{
   244  			invIndex: invOp.invIndex,
   245  			op:       invOp.op,
   246  			shards:   a.shards,
   247  			start:    a.minStart,
   248  			duration: a.maxEnd - a.minStart,
   249  			total:    a.total,
   250  			read:     a.read,
   251  			write:    a.write,
   252  			min:      a.durations[0],
   253  			q1:       q1,
   254  			q2:       q2,
   255  			q3:       q3,
   256  			max:      a.durations[len(a.durations)-1],
   257  		})
   258  	}
   259  	return opStats
   260  }
   261  
   262  func truncatef(v interface{}) string {
   263  	b := limitbuf.NewLogger(80)
   264  	fmt.Fprint(b, v)
   265  	return b.String()
   266  }