cuelang.org/go@v0.10.1/internal/golangorgx/telemetry/crashmonitor/monitor.go (about)

     1  // Copyright 2024 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package crashmonitor
     6  
     7  // This file defines a monitor that reports arbitrary Go runtime
     8  // crashes to telemetry.
     9  
    10  import (
    11  	"bytes"
    12  	"fmt"
    13  	"io"
    14  	"log"
    15  	"os"
    16  	"os/exec"
    17  	"reflect"
    18  	"runtime/debug"
    19  	"strconv"
    20  	"strings"
    21  
    22  	"cuelang.org/go/internal/golangorgx/telemetry/internal/counter"
    23  )
    24  
    25  // Supported reports whether the runtime supports [runtime.SetCrashOutput].
    26  //
    27  // TODO(adonovan): eliminate once go1.23+ is assured.
    28  func Supported() bool { return setCrashOutput != nil }
    29  
    30  var setCrashOutput func(*os.File) error // = runtime.SetCrashOutput on go1.23+
    31  
    32  // Start starts the monitor process, which performs automated
    33  // reporting of unexpected crashes via Go telemetry. Call this
    34  // function once immediately after [counter.Open]() within the main
    35  // function of your application, before argument parsing.
    36  //
    37  // This function re-executes the current executable as a child
    38  // process, in a special mode. In that mode, the call to Start will
    39  // never return.
    40  //
    41  // The application should avoid doing expensive work in init functions
    42  // as they will be executed twice. Run with GODEBUG=inittrace=1 to
    43  // display the running time of each package initializer.
    44  //
    45  // Start uses the [debug.SetCrashOutput] mechanism, which is a
    46  // process-wide resource. Do not make other calls to that function
    47  // within your application. Start is a no-op unless the program is
    48  // built with go1.23+.
    49  func Start() {
    50  	if !Supported() {
    51  		return
    52  	}
    53  
    54  	const crashmonitorVar = "X_TELEMETRY_CRASHMONITOR"
    55  	if os.Getenv(crashmonitorVar) != "" {
    56  		// This process is the crashmonitor (child).
    57  		log.SetFlags(0)
    58  		log.SetPrefix("crashmonitor: ")
    59  
    60  		// Wait for parent process's dying gasp.
    61  		// If the parent dies for any reason this read will return.
    62  		data, err := io.ReadAll(os.Stdin)
    63  		if err != nil {
    64  			log.Fatalf("failed to read from input pipe: %v", err)
    65  		}
    66  
    67  		// If the only line is the sentinel, it wasn't a crash.
    68  		if bytes.Count(data, []byte("\n")) < 2 {
    69  			os.Exit(0) // parent exited without crash report
    70  		}
    71  
    72  		log.Printf("parent reported crash:\n%s", data)
    73  
    74  		// Parse the stack out of the crash report
    75  		// and record a telemetry count for it.
    76  		name, err := telemetryCounterName(data)
    77  		if err != nil {
    78  			// Keep count of how often this happens
    79  			// so that we can investigate if necessary.
    80  			incrementCounter("crash/malformed")
    81  
    82  			// Something went wrong.
    83  			// Save the crash securely in the file system.
    84  			f, err := os.CreateTemp(os.TempDir(), "*.crash")
    85  			if err != nil {
    86  				log.Fatal(err)
    87  			}
    88  			if _, err := f.Write(data); err != nil {
    89  				log.Fatal(err)
    90  			}
    91  			if err := f.Close(); err != nil {
    92  				log.Fatal(err)
    93  			}
    94  			log.Printf("failed to report crash to telemetry: %v", err)
    95  			log.Fatalf("crash report saved at %s", f.Name())
    96  		}
    97  
    98  		incrementCounter(name)
    99  
   100  		log.Fatalf("telemetry crash recorded")
   101  	}
   102  
   103  	// This process is the application (parent).
   104  	// Fork+exec the crashmonitor (child).
   105  	exe, err := os.Executable()
   106  	if err != nil {
   107  		log.Fatal(err)
   108  	}
   109  	cmd := exec.Command(exe, "** crashmonitor **") // this unused arg is just for ps(1)
   110  	cmd.Env = append(os.Environ(), crashmonitorVar+"=1")
   111  	cmd.Stderr = os.Stderr
   112  	cmd.Stdout = os.Stderr
   113  	pipe, err := cmd.StdinPipe()
   114  	if err != nil {
   115  		log.Fatalf("StdinPipe: %v", err)
   116  	}
   117  
   118  	writeSentinel(pipe)
   119  	// Ensure that we get pc=0x%x values in the traceback.
   120  	debug.SetTraceback("system")
   121  	setCrashOutput(pipe.(*os.File)) // (this conversion is safe)
   122  
   123  	if err := cmd.Start(); err != nil {
   124  		log.Fatalf("can't start crash monitor: %v", err)
   125  	}
   126  
   127  	// Now return and run the application proper...
   128  }
   129  
   130  // (stubbed by test)
   131  var incrementCounter = func(name string) { counter.New(name).Inc() }
   132  
   133  // The sentinel function returns its address. The difference between
   134  // this value as observed by calls in two different processes of the
   135  // same executable tells us the relative offset of their text segments.
   136  //
   137  // It would be nice if SetCrashOutput took care of this as it's fiddly
   138  // and likely to confuse every user at first.
   139  func sentinel() uint64 {
   140  	return uint64(reflect.ValueOf(sentinel).Pointer())
   141  }
   142  
   143  func writeSentinel(out io.Writer) {
   144  	fmt.Fprintf(out, "sentinel %x\n", sentinel())
   145  }
   146  
   147  // telemetryCounterName parses a crash report produced by the Go
   148  // runtime, extracts the stack of the first runnable goroutine,
   149  // converts each line into telemetry form ("symbol:relative-line"),
   150  // and returns this as the name of a counter.
   151  func telemetryCounterName(crash []byte) (string, error) {
   152  	pcs, err := parseStackPCs(string(crash))
   153  	if err != nil {
   154  		return "", err
   155  	}
   156  
   157  	// Limit the number of frames we request.
   158  	pcs = pcs[:min(len(pcs), 16)]
   159  
   160  	if len(pcs) == 0 {
   161  		// This can occur if all goroutines are idle, as when
   162  		// caught in a deadlock, or killed by an async signal
   163  		// while blocked.
   164  		//
   165  		// TODO(adonovan): consider how to report such
   166  		// situations. Reporting a goroutine in [sleep] or
   167  		// [select] state could be quite confusing without
   168  		// further information about the nature of the crash,
   169  		// as the problem is not local to the code location.
   170  		//
   171  		// For now, we keep count of this situation so that we
   172  		// can access whether it needs a more involved solution.
   173  		return "crash/no-running-goroutine", nil
   174  	}
   175  
   176  	// This string appears at the start of all
   177  	// crashmonitor-generated counter names.
   178  	//
   179  	// It is tempting to expose this as a parameter of Start, but
   180  	// it is not without risk. What value should most programs
   181  	// provide? There's no point giving the name of the executable
   182  	// as this is already recorded by telemetry. What if the
   183  	// application runs in multiple modes? Then it might be useful
   184  	// to record the mode. The problem is that an application with
   185  	// multiple modes probably doesn't know its mode by line 1 of
   186  	// main.main: it might require flag or argument parsing, or
   187  	// even validation of an environment variable, and we really
   188  	// want to steer users aware from any logic before Start. The
   189  	// flags and arguments will be wrong in the child process, and
   190  	// every extra conditional branch creates a risk that the
   191  	// recursively executed child program will behave not like the
   192  	// monitor but like the application. If the child process
   193  	// exits before calling Start, then the parent application
   194  	// will not have a monitor, and its crash reports will be
   195  	// discarded (written in to a pipe that is never read).
   196  	//
   197  	// So for now, we use this constant string.
   198  	const prefix = "crash/crash"
   199  	return counter.EncodeStack(pcs, prefix), nil
   200  }
   201  
   202  // parseStackPCs parses the parent process's program counters for the
   203  // first running goroutine out of a GOTRACEBACK=system traceback,
   204  // adjusting them so that they are valid for the child process's text
   205  // segment.
   206  //
   207  // This function returns only program counter values, ensuring that
   208  // there is no possibility of strings from the crash report (which may
   209  // contain PII) leaking into the telemetry system.
   210  func parseStackPCs(crash string) ([]uintptr, error) {
   211  	// getPC parses the PC out of a line of the form:
   212  	//     \tFILE:LINE +0xRELPC sp=... fp=... pc=...
   213  	getPC := func(line string) (uint64, error) {
   214  		_, pcstr, ok := strings.Cut(line, " pc=") // e.g. pc=0x%x
   215  		if !ok {
   216  			return 0, fmt.Errorf("no pc= for stack frame: %s", line)
   217  		}
   218  		return strconv.ParseUint(pcstr, 0, 64) // 0 => allow 0x prefix
   219  	}
   220  
   221  	var (
   222  		pcs            []uintptr
   223  		parentSentinel uint64
   224  		childSentinel  = sentinel()
   225  		on             = false // are we in the first running goroutine?
   226  		lines          = strings.Split(crash, "\n")
   227  	)
   228  	for i := 0; i < len(lines); i++ {
   229  		line := lines[i]
   230  
   231  		// Read sentinel value.
   232  		if parentSentinel == 0 && strings.HasPrefix(line, "sentinel ") {
   233  			_, err := fmt.Sscanf(line, "sentinel %x", &parentSentinel)
   234  			if err != nil {
   235  				return nil, fmt.Errorf("can't read sentinel line")
   236  			}
   237  			continue
   238  		}
   239  
   240  		// Search for "goroutine GID [STATUS]"
   241  		if !on {
   242  			if strings.HasPrefix(line, "goroutine ") &&
   243  				strings.Contains(line, " [running]:") {
   244  				on = true
   245  
   246  				if parentSentinel == 0 {
   247  					return nil, fmt.Errorf("no sentinel value in crash report")
   248  				}
   249  			}
   250  			continue
   251  		}
   252  
   253  		// A blank line marks end of a goroutine stack.
   254  		if line == "" {
   255  			break
   256  		}
   257  
   258  		// Skip the final "created by SYMBOL in goroutine GID" part.
   259  		if strings.HasPrefix(line, "created by ") {
   260  			break
   261  		}
   262  
   263  		// Expect a pair of lines:
   264  		//   SYMBOL(ARGS)
   265  		//   \tFILE:LINE +0xRELPC sp=0x%x fp=0x%x pc=0x%x
   266  		// Note: SYMBOL may contain parens "pkg.(*T).method"
   267  		// The RELPC is sometimes missing.
   268  
   269  		// Skip the symbol(args) line.
   270  		i++
   271  		if i == len(lines) {
   272  			break
   273  		}
   274  		line = lines[i]
   275  
   276  		// Parse the PC, and correct for the parent and child's
   277  		// different mappings of the text section.
   278  		pc, err := getPC(line)
   279  		if err != nil {
   280  			// Inlined frame, perhaps; skip it.
   281  			continue
   282  		}
   283  		pcs = append(pcs, uintptr(pc-parentSentinel+childSentinel))
   284  	}
   285  	return pcs, nil
   286  }
   287  
   288  func min(x, y int) int {
   289  	if x < y {
   290  		return x
   291  	} else {
   292  		return y
   293  	}
   294  }