cuelang.org/go@v0.10.1/internal/golangorgx/telemetry/crashmonitor/monitor.go (about) 1 // Copyright 2024 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package crashmonitor 6 7 // This file defines a monitor that reports arbitrary Go runtime 8 // crashes to telemetry. 9 10 import ( 11 "bytes" 12 "fmt" 13 "io" 14 "log" 15 "os" 16 "os/exec" 17 "reflect" 18 "runtime/debug" 19 "strconv" 20 "strings" 21 22 "cuelang.org/go/internal/golangorgx/telemetry/internal/counter" 23 ) 24 25 // Supported reports whether the runtime supports [runtime.SetCrashOutput]. 26 // 27 // TODO(adonovan): eliminate once go1.23+ is assured. 28 func Supported() bool { return setCrashOutput != nil } 29 30 var setCrashOutput func(*os.File) error // = runtime.SetCrashOutput on go1.23+ 31 32 // Start starts the monitor process, which performs automated 33 // reporting of unexpected crashes via Go telemetry. Call this 34 // function once immediately after [counter.Open]() within the main 35 // function of your application, before argument parsing. 36 // 37 // This function re-executes the current executable as a child 38 // process, in a special mode. In that mode, the call to Start will 39 // never return. 40 // 41 // The application should avoid doing expensive work in init functions 42 // as they will be executed twice. Run with GODEBUG=inittrace=1 to 43 // display the running time of each package initializer. 44 // 45 // Start uses the [debug.SetCrashOutput] mechanism, which is a 46 // process-wide resource. Do not make other calls to that function 47 // within your application. Start is a no-op unless the program is 48 // built with go1.23+. 49 func Start() { 50 if !Supported() { 51 return 52 } 53 54 const crashmonitorVar = "X_TELEMETRY_CRASHMONITOR" 55 if os.Getenv(crashmonitorVar) != "" { 56 // This process is the crashmonitor (child). 57 log.SetFlags(0) 58 log.SetPrefix("crashmonitor: ") 59 60 // Wait for parent process's dying gasp. 61 // If the parent dies for any reason this read will return. 62 data, err := io.ReadAll(os.Stdin) 63 if err != nil { 64 log.Fatalf("failed to read from input pipe: %v", err) 65 } 66 67 // If the only line is the sentinel, it wasn't a crash. 68 if bytes.Count(data, []byte("\n")) < 2 { 69 os.Exit(0) // parent exited without crash report 70 } 71 72 log.Printf("parent reported crash:\n%s", data) 73 74 // Parse the stack out of the crash report 75 // and record a telemetry count for it. 76 name, err := telemetryCounterName(data) 77 if err != nil { 78 // Keep count of how often this happens 79 // so that we can investigate if necessary. 80 incrementCounter("crash/malformed") 81 82 // Something went wrong. 83 // Save the crash securely in the file system. 84 f, err := os.CreateTemp(os.TempDir(), "*.crash") 85 if err != nil { 86 log.Fatal(err) 87 } 88 if _, err := f.Write(data); err != nil { 89 log.Fatal(err) 90 } 91 if err := f.Close(); err != nil { 92 log.Fatal(err) 93 } 94 log.Printf("failed to report crash to telemetry: %v", err) 95 log.Fatalf("crash report saved at %s", f.Name()) 96 } 97 98 incrementCounter(name) 99 100 log.Fatalf("telemetry crash recorded") 101 } 102 103 // This process is the application (parent). 104 // Fork+exec the crashmonitor (child). 105 exe, err := os.Executable() 106 if err != nil { 107 log.Fatal(err) 108 } 109 cmd := exec.Command(exe, "** crashmonitor **") // this unused arg is just for ps(1) 110 cmd.Env = append(os.Environ(), crashmonitorVar+"=1") 111 cmd.Stderr = os.Stderr 112 cmd.Stdout = os.Stderr 113 pipe, err := cmd.StdinPipe() 114 if err != nil { 115 log.Fatalf("StdinPipe: %v", err) 116 } 117 118 writeSentinel(pipe) 119 // Ensure that we get pc=0x%x values in the traceback. 120 debug.SetTraceback("system") 121 setCrashOutput(pipe.(*os.File)) // (this conversion is safe) 122 123 if err := cmd.Start(); err != nil { 124 log.Fatalf("can't start crash monitor: %v", err) 125 } 126 127 // Now return and run the application proper... 128 } 129 130 // (stubbed by test) 131 var incrementCounter = func(name string) { counter.New(name).Inc() } 132 133 // The sentinel function returns its address. The difference between 134 // this value as observed by calls in two different processes of the 135 // same executable tells us the relative offset of their text segments. 136 // 137 // It would be nice if SetCrashOutput took care of this as it's fiddly 138 // and likely to confuse every user at first. 139 func sentinel() uint64 { 140 return uint64(reflect.ValueOf(sentinel).Pointer()) 141 } 142 143 func writeSentinel(out io.Writer) { 144 fmt.Fprintf(out, "sentinel %x\n", sentinel()) 145 } 146 147 // telemetryCounterName parses a crash report produced by the Go 148 // runtime, extracts the stack of the first runnable goroutine, 149 // converts each line into telemetry form ("symbol:relative-line"), 150 // and returns this as the name of a counter. 151 func telemetryCounterName(crash []byte) (string, error) { 152 pcs, err := parseStackPCs(string(crash)) 153 if err != nil { 154 return "", err 155 } 156 157 // Limit the number of frames we request. 158 pcs = pcs[:min(len(pcs), 16)] 159 160 if len(pcs) == 0 { 161 // This can occur if all goroutines are idle, as when 162 // caught in a deadlock, or killed by an async signal 163 // while blocked. 164 // 165 // TODO(adonovan): consider how to report such 166 // situations. Reporting a goroutine in [sleep] or 167 // [select] state could be quite confusing without 168 // further information about the nature of the crash, 169 // as the problem is not local to the code location. 170 // 171 // For now, we keep count of this situation so that we 172 // can access whether it needs a more involved solution. 173 return "crash/no-running-goroutine", nil 174 } 175 176 // This string appears at the start of all 177 // crashmonitor-generated counter names. 178 // 179 // It is tempting to expose this as a parameter of Start, but 180 // it is not without risk. What value should most programs 181 // provide? There's no point giving the name of the executable 182 // as this is already recorded by telemetry. What if the 183 // application runs in multiple modes? Then it might be useful 184 // to record the mode. The problem is that an application with 185 // multiple modes probably doesn't know its mode by line 1 of 186 // main.main: it might require flag or argument parsing, or 187 // even validation of an environment variable, and we really 188 // want to steer users aware from any logic before Start. The 189 // flags and arguments will be wrong in the child process, and 190 // every extra conditional branch creates a risk that the 191 // recursively executed child program will behave not like the 192 // monitor but like the application. If the child process 193 // exits before calling Start, then the parent application 194 // will not have a monitor, and its crash reports will be 195 // discarded (written in to a pipe that is never read). 196 // 197 // So for now, we use this constant string. 198 const prefix = "crash/crash" 199 return counter.EncodeStack(pcs, prefix), nil 200 } 201 202 // parseStackPCs parses the parent process's program counters for the 203 // first running goroutine out of a GOTRACEBACK=system traceback, 204 // adjusting them so that they are valid for the child process's text 205 // segment. 206 // 207 // This function returns only program counter values, ensuring that 208 // there is no possibility of strings from the crash report (which may 209 // contain PII) leaking into the telemetry system. 210 func parseStackPCs(crash string) ([]uintptr, error) { 211 // getPC parses the PC out of a line of the form: 212 // \tFILE:LINE +0xRELPC sp=... fp=... pc=... 213 getPC := func(line string) (uint64, error) { 214 _, pcstr, ok := strings.Cut(line, " pc=") // e.g. pc=0x%x 215 if !ok { 216 return 0, fmt.Errorf("no pc= for stack frame: %s", line) 217 } 218 return strconv.ParseUint(pcstr, 0, 64) // 0 => allow 0x prefix 219 } 220 221 var ( 222 pcs []uintptr 223 parentSentinel uint64 224 childSentinel = sentinel() 225 on = false // are we in the first running goroutine? 226 lines = strings.Split(crash, "\n") 227 ) 228 for i := 0; i < len(lines); i++ { 229 line := lines[i] 230 231 // Read sentinel value. 232 if parentSentinel == 0 && strings.HasPrefix(line, "sentinel ") { 233 _, err := fmt.Sscanf(line, "sentinel %x", &parentSentinel) 234 if err != nil { 235 return nil, fmt.Errorf("can't read sentinel line") 236 } 237 continue 238 } 239 240 // Search for "goroutine GID [STATUS]" 241 if !on { 242 if strings.HasPrefix(line, "goroutine ") && 243 strings.Contains(line, " [running]:") { 244 on = true 245 246 if parentSentinel == 0 { 247 return nil, fmt.Errorf("no sentinel value in crash report") 248 } 249 } 250 continue 251 } 252 253 // A blank line marks end of a goroutine stack. 254 if line == "" { 255 break 256 } 257 258 // Skip the final "created by SYMBOL in goroutine GID" part. 259 if strings.HasPrefix(line, "created by ") { 260 break 261 } 262 263 // Expect a pair of lines: 264 // SYMBOL(ARGS) 265 // \tFILE:LINE +0xRELPC sp=0x%x fp=0x%x pc=0x%x 266 // Note: SYMBOL may contain parens "pkg.(*T).method" 267 // The RELPC is sometimes missing. 268 269 // Skip the symbol(args) line. 270 i++ 271 if i == len(lines) { 272 break 273 } 274 line = lines[i] 275 276 // Parse the PC, and correct for the parent and child's 277 // different mappings of the text section. 278 pc, err := getPC(line) 279 if err != nil { 280 // Inlined frame, perhaps; skip it. 281 continue 282 } 283 pcs = append(pcs, uintptr(pc-parentSentinel+childSentinel)) 284 } 285 return pcs, nil 286 } 287 288 func min(x, y int) int { 289 if x < y { 290 return x 291 } else { 292 return y 293 } 294 }