github.com/Ilhicas/nomad@v1.0.4-0.20210304152020-e86851182bc3/command/operator_debug.go (about) 1 package command 2 3 import ( 4 "archive/tar" 5 "compress/gzip" 6 "context" 7 "crypto/tls" 8 "encoding/json" 9 "fmt" 10 "html/template" 11 "io" 12 "io/ioutil" 13 "net/http" 14 "os" 15 "os/signal" 16 "path/filepath" 17 "strconv" 18 "strings" 19 "syscall" 20 "time" 21 22 "github.com/hashicorp/go-cleanhttp" 23 "github.com/hashicorp/nomad/api" 24 "github.com/hashicorp/nomad/helper" 25 "github.com/hashicorp/nomad/nomad/structs" 26 "github.com/posener/complete" 27 ) 28 29 type OperatorDebugCommand struct { 30 Meta 31 32 timestamp string 33 collectDir string 34 duration time.Duration 35 interval time.Duration 36 pprofDuration time.Duration 37 logLevel string 38 stale bool 39 maxNodes int 40 nodeClass string 41 nodeIDs []string 42 serverIDs []string 43 consul *external 44 vault *external 45 manifest []string 46 ctx context.Context 47 cancel context.CancelFunc 48 } 49 50 const ( 51 userAgent = "nomad operator debug" 52 ) 53 54 func (c *OperatorDebugCommand) Help() string { 55 helpText := ` 56 Usage: nomad operator debug [options] 57 58 Build an archive containing Nomad cluster configuration and state, and Consul and Vault 59 status. Include logs and pprof profiles for selected servers and client nodes. 60 61 If ACLs are enabled, this command will require a token with the 'node:read' 62 capability to run. In order to collect information, the token will also 63 require the 'agent:read' and 'operator:read' capabilities, as well as the 64 'list-jobs' capability for all namespaces. To collect pprof profiles the 65 token will also require 'agent:write', or enable_debug configuration set to true. 66 67 General Options: 68 69 ` + generalOptionsUsage(usageOptsDefault|usageOptsNoNamespace) + ` 70 71 Debug Options: 72 73 -duration=<duration> 74 The duration of the log monitor command. Defaults to 2m. 75 76 -interval=<interval> 77 The interval between snapshots of the Nomad state. If unspecified, only one snapshot is 78 captured. 79 80 -log-level=<level> 81 The log level to monitor. Defaults to DEBUG. 82 83 -max-nodes=<count> 84 Cap the maximum number of client nodes included in the capture. Defaults to 10, set to 0 for unlimited. 85 86 -node-id=<node>,<node> 87 Comma separated list of Nomad client node ids, to monitor for logs and include pprof 88 profiles. Accepts id prefixes, and "all" to select all nodes (up to count = max-nodes). 89 90 -node-class=<node-class> 91 Filter client nodes based on node class. 92 93 -pprof-duration=<duration> 94 Duration for pprof collection. Defaults to 1s. 95 96 -server-id=<server>,<server> 97 Comma separated list of Nomad server names, "leader", or "all" to monitor for logs and include pprof 98 profiles. 99 100 -stale=<true|false> 101 If "false", the default, get membership data from the cluster leader. If the cluster is in 102 an outage unable to establish leadership, it may be necessary to get the configuration from 103 a non-leader server. 104 105 -output=<path> 106 Path to the parent directory of the output directory. If not specified, an archive is built 107 in the current directory. 108 109 -consul-http-addr=<addr> 110 The address and port of the Consul HTTP agent. Overrides the CONSUL_HTTP_ADDR environment variable. 111 112 -consul-token=<token> 113 Token used to query Consul. Overrides the CONSUL_HTTP_TOKEN environment 114 variable and the Consul token file. 115 116 -consul-token-file=<path> 117 Path to the Consul token file. Overrides the CONSUL_HTTP_TOKEN_FILE 118 environment variable. 119 120 -consul-client-cert=<path> 121 Path to the Consul client cert file. Overrides the CONSUL_CLIENT_CERT 122 environment variable. 123 124 -consul-client-key=<path> 125 Path to the Consul client key file. Overrides the CONSUL_CLIENT_KEY 126 environment variable. 127 128 -consul-ca-cert=<path> 129 Path to a CA file to use with Consul. Overrides the CONSUL_CACERT 130 environment variable and the Consul CA path. 131 132 -consul-ca-path=<path> 133 Path to a directory of PEM encoded CA cert files to verify the Consul 134 certificate. Overrides the CONSUL_CAPATH environment variable. 135 136 -vault-address=<addr> 137 The address and port of the Vault HTTP agent. Overrides the VAULT_ADDR 138 environment variable. 139 140 -vault-token=<token> 141 Token used to query Vault. Overrides the VAULT_TOKEN environment 142 variable. 143 144 -vault-client-cert=<path> 145 Path to the Vault client cert file. Overrides the VAULT_CLIENT_CERT 146 environment variable. 147 148 -vault-client-key=<path> 149 Path to the Vault client key file. Overrides the VAULT_CLIENT_KEY 150 environment variable. 151 152 -vault-ca-cert=<path> 153 Path to a CA file to use with Vault. Overrides the VAULT_CACERT 154 environment variable and the Vault CA path. 155 156 -vault-ca-path=<path> 157 Path to a directory of PEM encoded CA cert files to verify the Vault 158 certificate. Overrides the VAULT_CAPATH environment variable. 159 ` 160 return strings.TrimSpace(helpText) 161 } 162 163 func (c *OperatorDebugCommand) Synopsis() string { 164 return "Build a debug archive" 165 } 166 167 func (c *OperatorDebugCommand) AutocompleteFlags() complete.Flags { 168 return mergeAutocompleteFlags(c.Meta.AutocompleteFlags(FlagSetClient), 169 complete.Flags{ 170 "-duration": complete.PredictAnything, 171 "-interval": complete.PredictAnything, 172 "-log-level": complete.PredictAnything, 173 "-max-nodes": complete.PredictAnything, 174 "-node-class": complete.PredictAnything, 175 "-node-id": complete.PredictAnything, 176 "-server-id": complete.PredictAnything, 177 "-output": complete.PredictAnything, 178 "-pprof-duration": complete.PredictAnything, 179 "-consul-token": complete.PredictAnything, 180 "-vault-token": complete.PredictAnything, 181 }) 182 } 183 184 func (c *OperatorDebugCommand) AutocompleteArgs() complete.Predictor { 185 return complete.PredictNothing 186 } 187 188 func (c *OperatorDebugCommand) Name() string { return "debug" } 189 190 func (c *OperatorDebugCommand) Run(args []string) int { 191 flags := c.Meta.FlagSet(c.Name(), FlagSetClient) 192 flags.Usage = func() { c.Ui.Output(c.Help()) } 193 194 var duration, interval, output, pprofDuration string 195 var nodeIDs, serverIDs string 196 197 flags.StringVar(&duration, "duration", "2m", "") 198 flags.StringVar(&interval, "interval", "2m", "") 199 flags.StringVar(&c.logLevel, "log-level", "DEBUG", "") 200 flags.IntVar(&c.maxNodes, "max-nodes", 10, "") 201 flags.StringVar(&c.nodeClass, "node-class", "", "") 202 flags.StringVar(&nodeIDs, "node-id", "", "") 203 flags.StringVar(&serverIDs, "server-id", "", "") 204 flags.BoolVar(&c.stale, "stale", false, "") 205 flags.StringVar(&output, "output", "", "") 206 flags.StringVar(&pprofDuration, "pprof-duration", "1s", "") 207 208 c.consul = &external{tls: &api.TLSConfig{}} 209 flags.StringVar(&c.consul.addrVal, "consul-http-addr", os.Getenv("CONSUL_HTTP_ADDR"), "") 210 ssl := os.Getenv("CONSUL_HTTP_SSL") 211 c.consul.ssl, _ = strconv.ParseBool(ssl) 212 flags.StringVar(&c.consul.auth, "consul-auth", os.Getenv("CONSUL_HTTP_AUTH"), "") 213 flags.StringVar(&c.consul.tokenVal, "consul-token", os.Getenv("CONSUL_HTTP_TOKEN"), "") 214 flags.StringVar(&c.consul.tokenFile, "consul-token-file", os.Getenv("CONSUL_HTTP_TOKEN_FILE"), "") 215 flags.StringVar(&c.consul.tls.ClientCert, "consul-client-cert", os.Getenv("CONSUL_CLIENT_CERT"), "") 216 flags.StringVar(&c.consul.tls.ClientKey, "consul-client-key", os.Getenv("CONSUL_CLIENT_KEY"), "") 217 flags.StringVar(&c.consul.tls.CACert, "consul-ca-cert", os.Getenv("CONSUL_CACERT"), "") 218 flags.StringVar(&c.consul.tls.CAPath, "consul-ca-path", os.Getenv("CONSUL_CAPATH"), "") 219 220 c.vault = &external{tls: &api.TLSConfig{}} 221 flags.StringVar(&c.vault.addrVal, "vault-address", os.Getenv("VAULT_ADDR"), "") 222 flags.StringVar(&c.vault.tokenVal, "vault-token", os.Getenv("VAULT_TOKEN"), "") 223 flags.StringVar(&c.vault.tls.CACert, "vault-ca-cert", os.Getenv("VAULT_CACERT"), "") 224 flags.StringVar(&c.vault.tls.CAPath, "vault-ca-path", os.Getenv("VAULT_CAPATH"), "") 225 flags.StringVar(&c.vault.tls.ClientCert, "vault-client-cert", os.Getenv("VAULT_CLIENT_CERT"), "") 226 flags.StringVar(&c.vault.tls.ClientKey, "vault-client-key", os.Getenv("VAULT_CLIENT_KEY"), "") 227 228 if err := flags.Parse(args); err != nil { 229 c.Ui.Error(fmt.Sprintf("Error parsing arguments: %q", err)) 230 return 1 231 } 232 233 // Parse the capture duration 234 d, err := time.ParseDuration(duration) 235 if err != nil { 236 c.Ui.Error(fmt.Sprintf("Error parsing duration: %s: %s", duration, err.Error())) 237 return 1 238 } 239 c.duration = d 240 241 // Parse the capture interval 242 i, err := time.ParseDuration(interval) 243 if err != nil { 244 c.Ui.Error(fmt.Sprintf("Error parsing interval: %s: %s", interval, err.Error())) 245 return 1 246 } 247 c.interval = i 248 249 // Parse the pprof capture duration 250 pd, err := time.ParseDuration(pprofDuration) 251 if err != nil { 252 c.Ui.Error(fmt.Sprintf("Error parsing pprof duration: %s: %s", pprofDuration, err.Error())) 253 return 1 254 } 255 c.pprofDuration = pd 256 257 // Verify there are no extra arguments 258 args = flags.Args() 259 if l := len(args); l != 0 { 260 c.Ui.Error("This command takes no arguments") 261 c.Ui.Error(commandErrorText(c)) 262 return 1 263 } 264 265 // Initialize capture variables and structs 266 c.manifest = make([]string, 0) 267 ctx, cancel := context.WithCancel(context.Background()) 268 c.ctx = ctx 269 c.cancel = cancel 270 c.trap() 271 272 // Generate timestamped file name 273 format := "2006-01-02-150405Z" 274 c.timestamp = time.Now().UTC().Format(format) 275 stamped := "nomad-debug-" + c.timestamp 276 277 // Create the output directory 278 var tmp string 279 if output != "" { 280 // User specified output directory 281 tmp = filepath.Join(output, stamped) 282 _, err := os.Stat(tmp) 283 if !os.IsNotExist(err) { 284 c.Ui.Error("Output directory already exists") 285 return 2 286 } 287 } else { 288 // Generate temp directory 289 tmp, err = ioutil.TempDir(os.TempDir(), stamped) 290 if err != nil { 291 c.Ui.Error(fmt.Sprintf("Error creating tmp directory: %s", err.Error())) 292 return 2 293 } 294 defer os.RemoveAll(tmp) 295 } 296 297 c.collectDir = tmp 298 299 // Create an instance of the API client 300 client, err := c.Meta.Client() 301 if err != nil { 302 c.Ui.Error(fmt.Sprintf("Error initializing client: %s", err.Error())) 303 return 1 304 } 305 306 // Search all nodes If a node class is specified without a list of node id prefixes 307 if c.nodeClass != "" && nodeIDs == "" { 308 nodeIDs = "all" 309 } 310 311 // Resolve client node id prefixes 312 nodesFound := 0 313 nodeLookupFailCount := 0 314 nodeCaptureCount := 0 315 316 for _, id := range argNodes(nodeIDs) { 317 if id == "all" { 318 // Capture from all nodes using empty prefix filter 319 id = "" 320 } else { 321 // Capture from nodes starting with prefix id 322 id = sanitizeUUIDPrefix(id) 323 } 324 nodes, _, err := client.Nodes().PrefixList(id) 325 if err != nil { 326 c.Ui.Error(fmt.Sprintf("Error querying node info: %s", err)) 327 return 1 328 } 329 330 // Increment fail count if no nodes are found 331 if len(nodes) == 0 { 332 c.Ui.Error(fmt.Sprintf("No node(s) with prefix %q found", id)) 333 nodeLookupFailCount++ 334 continue 335 } 336 337 nodesFound += len(nodes) 338 339 // Apply constraints to nodes found 340 for _, n := range nodes { 341 // Ignore nodes that do not match specified class 342 if c.nodeClass != "" && n.NodeClass != c.nodeClass { 343 continue 344 } 345 346 // Add node to capture list 347 c.nodeIDs = append(c.nodeIDs, n.ID) 348 nodeCaptureCount++ 349 350 // Stop looping when we reach the max 351 if c.maxNodes != 0 && nodeCaptureCount >= c.maxNodes { 352 break 353 } 354 } 355 } 356 357 // Return error if nodes were specified but none were found 358 if len(nodeIDs) > 0 && nodeCaptureCount == 0 { 359 c.Ui.Error(fmt.Sprintf("Failed to retrieve clients, 0 nodes found in list: %s", nodeIDs)) 360 return 1 361 } 362 363 // Resolve servers 364 members, err := client.Agent().Members() 365 if err != nil { 366 c.Ui.Error(fmt.Sprintf("Failed to retrieve server list; err: %v", err)) 367 return 1 368 } 369 c.writeJSON("version", "members.json", members, err) 370 // We always write the error to the file, but don't range if no members found 371 if serverIDs == "all" && members != nil { 372 // Special case to capture from all servers 373 for _, member := range members.Members { 374 c.serverIDs = append(c.serverIDs, member.Name) 375 } 376 } else { 377 c.serverIDs = append(c.serverIDs, argNodes(serverIDs)...) 378 } 379 380 serversFound := 0 381 serverCaptureCount := 0 382 383 if members != nil { 384 serversFound = len(members.Members) 385 } 386 if c.serverIDs != nil { 387 serverCaptureCount = len(c.serverIDs) 388 } 389 390 // Return error if servers were specified but not found 391 if len(serverIDs) > 0 && serverCaptureCount == 0 { 392 c.Ui.Error(fmt.Sprintf("Failed to retrieve servers, 0 members found in list: %s", serverIDs)) 393 return 1 394 } 395 396 // Display general info about the capture 397 c.Ui.Output("Starting debugger...") 398 c.Ui.Output("") 399 c.Ui.Output(fmt.Sprintf(" Servers: (%d/%d) %v", serverCaptureCount, serversFound, c.serverIDs)) 400 c.Ui.Output(fmt.Sprintf(" Clients: (%d/%d) %v", nodeCaptureCount, nodesFound, c.nodeIDs)) 401 if nodeCaptureCount > 0 && nodeCaptureCount == c.maxNodes { 402 c.Ui.Output(fmt.Sprintf(" Max node count reached (%d)", c.maxNodes)) 403 } 404 if nodeLookupFailCount > 0 { 405 c.Ui.Output(fmt.Sprintf("Client fail count: %v", nodeLookupFailCount)) 406 } 407 if c.nodeClass != "" { 408 c.Ui.Output(fmt.Sprintf(" Node Class: %s", c.nodeClass)) 409 } 410 c.Ui.Output(fmt.Sprintf(" Interval: %s", interval)) 411 c.Ui.Output(fmt.Sprintf(" Duration: %s", duration)) 412 if c.pprofDuration.Seconds() != 1 { 413 c.Ui.Output(fmt.Sprintf(" pprof Duration: %s", c.pprofDuration)) 414 } 415 c.Ui.Output("") 416 c.Ui.Output("Capturing cluster data...") 417 418 // Start collecting data 419 err = c.collect(client) 420 if err != nil { 421 c.Ui.Error(fmt.Sprintf("Error collecting data: %s", err.Error())) 422 return 2 423 } 424 425 // Write index json/html manifest files 426 c.writeManifest() 427 428 // Exit before archive if output directory was specified 429 if output != "" { 430 c.Ui.Output(fmt.Sprintf("Created debug directory: %s", c.collectDir)) 431 return 0 432 } 433 434 // Create archive tarball 435 archiveFile := stamped + ".tar.gz" 436 err = TarCZF(archiveFile, tmp, stamped) 437 if err != nil { 438 c.Ui.Error(fmt.Sprintf("Error creating archive: %s", err.Error())) 439 return 2 440 } 441 442 // Final output with name of tarball 443 c.Ui.Output(fmt.Sprintf("Created debug archive: %s", archiveFile)) 444 return 0 445 } 446 447 // collect collects data from our endpoints and writes the archive bundle 448 func (c *OperatorDebugCommand) collect(client *api.Client) error { 449 // Version contains cluster meta information 450 dir := "version" 451 452 self, err := client.Agent().Self() 453 c.writeJSON(dir, "agent-self.json", self, err) 454 455 // Fetch data directly from consul and vault. Ignore errors 456 var consul, vault string 457 458 if self != nil { 459 r, ok := self.Config["Consul"] 460 if ok { 461 m, ok := r.(map[string]interface{}) 462 if ok { 463 464 raw := m["Addr"] 465 consul, _ = raw.(string) 466 raw = m["EnableSSL"] 467 ssl, _ := raw.(bool) 468 if ssl { 469 consul = "https://" + consul 470 } else { 471 consul = "http://" + consul 472 } 473 } 474 } 475 476 r, ok = self.Config["Vault"] 477 if ok { 478 m, ok := r.(map[string]interface{}) 479 if ok { 480 raw := m["Addr"] 481 vault, _ = raw.(string) 482 } 483 } 484 } 485 486 c.collectConsul(dir, consul) 487 c.collectVault(dir, vault) 488 c.collectAgentHosts(client) 489 c.collectPprofs(client) 490 491 c.startMonitors(client) 492 c.collectPeriodic(client) 493 494 return nil 495 } 496 497 // path returns platform specific paths in the tmp root directory 498 func (c *OperatorDebugCommand) path(paths ...string) string { 499 ps := []string{c.collectDir} 500 ps = append(ps, paths...) 501 return filepath.Join(ps...) 502 } 503 504 // mkdir creates directories in the tmp root directory 505 func (c *OperatorDebugCommand) mkdir(paths ...string) error { 506 joinedPath := c.path(paths...) 507 508 // Ensure path doesn't escape the sandbox of the capture directory 509 escapes := helper.PathEscapesSandbox(c.collectDir, joinedPath) 510 if escapes { 511 return fmt.Errorf("file path escapes capture directory") 512 } 513 514 return os.MkdirAll(joinedPath, 0755) 515 } 516 517 // startMonitors starts go routines for each node and client 518 func (c *OperatorDebugCommand) startMonitors(client *api.Client) { 519 for _, id := range c.nodeIDs { 520 go c.startMonitor("client", "node_id", id, client) 521 } 522 523 for _, id := range c.serverIDs { 524 go c.startMonitor("server", "server_id", id, client) 525 } 526 } 527 528 // startMonitor starts one monitor api request, writing to a file. It blocks and should be 529 // called in a go routine. Errors are ignored, we want to build the archive even if a node 530 // is unavailable 531 func (c *OperatorDebugCommand) startMonitor(path, idKey, nodeID string, client *api.Client) { 532 c.mkdir(path, nodeID) 533 fh, err := os.Create(c.path(path, nodeID, "monitor.log")) 534 if err != nil { 535 return 536 } 537 defer fh.Close() 538 539 qo := api.QueryOptions{ 540 Params: map[string]string{ 541 idKey: nodeID, 542 "log_level": c.logLevel, 543 }, 544 } 545 546 outCh, errCh := client.Agent().Monitor(c.ctx.Done(), &qo) 547 for { 548 select { 549 case out := <-outCh: 550 if out == nil { 551 continue 552 } 553 fh.Write(out.Data) 554 fh.WriteString("\n") 555 556 case err := <-errCh: 557 fh.WriteString(fmt.Sprintf("monitor: %s\n", err.Error())) 558 return 559 560 case <-c.ctx.Done(): 561 return 562 } 563 } 564 } 565 566 // collectAgentHosts calls collectAgentHost for each selected node 567 func (c *OperatorDebugCommand) collectAgentHosts(client *api.Client) { 568 for _, n := range c.nodeIDs { 569 c.collectAgentHost("client", n, client) 570 } 571 572 for _, n := range c.serverIDs { 573 c.collectAgentHost("server", n, client) 574 } 575 } 576 577 // collectAgentHost gets the agent host data 578 func (c *OperatorDebugCommand) collectAgentHost(path, id string, client *api.Client) { 579 var host *api.HostDataResponse 580 var err error 581 if path == "server" { 582 host, err = client.Agent().Host(id, "", nil) 583 } else { 584 host, err = client.Agent().Host("", id, nil) 585 } 586 587 if err != nil { 588 c.Ui.Error(fmt.Sprintf("%s/%s: Failed to retrieve agent host data, err: %v", path, id, err)) 589 590 if strings.Contains(err.Error(), structs.ErrPermissionDenied.Error()) { 591 // Drop a hint to help the operator resolve the error 592 c.Ui.Warn("Agent host retrieval requires agent:read ACL or enable_debug=true. See https://www.nomadproject.io/api-docs/agent#host for more information.") 593 } 594 return // exit on any error 595 } 596 597 path = filepath.Join(path, id) 598 c.writeJSON(path, "agent-host.json", host, err) 599 } 600 601 // collectPprofs captures the /agent/pprof for each listed node 602 func (c *OperatorDebugCommand) collectPprofs(client *api.Client) { 603 for _, n := range c.nodeIDs { 604 c.collectPprof("client", n, client) 605 } 606 607 for _, n := range c.serverIDs { 608 c.collectPprof("server", n, client) 609 } 610 } 611 612 // collectPprof captures pprof data for the node 613 func (c *OperatorDebugCommand) collectPprof(path, id string, client *api.Client) { 614 pprofDurationSeconds := int(c.pprofDuration.Seconds()) 615 opts := api.PprofOptions{Seconds: pprofDurationSeconds} 616 if path == "server" { 617 opts.ServerID = id 618 } else { 619 opts.NodeID = id 620 } 621 622 path = filepath.Join(path, id) 623 624 bs, err := client.Agent().CPUProfile(opts, nil) 625 if err != nil { 626 c.Ui.Error(fmt.Sprintf("%s: Failed to retrieve pprof profile.prof, err: %v", path, err)) 627 if structs.IsErrPermissionDenied(err) { 628 // All Profiles require the same permissions, so we only need to see 629 // one permission failure before we bail. 630 // But lets first drop a hint to help the operator resolve the error 631 632 c.Ui.Warn("Pprof retrieval requires agent:write ACL or enable_debug=true. See https://www.nomadproject.io/api-docs/agent#agent-runtime-profiles for more information.") 633 return // only exit on 403 634 } 635 } else { 636 err := c.writeBytes(path, "profile.prof", bs) 637 if err != nil { 638 c.Ui.Error(err.Error()) 639 } 640 } 641 642 bs, err = client.Agent().Trace(opts, nil) 643 if err != nil { 644 c.Ui.Error(fmt.Sprintf("%s: Failed to retrieve pprof trace.prof, err: %v", path, err)) 645 } else { 646 err := c.writeBytes(path, "trace.prof", bs) 647 if err != nil { 648 c.Ui.Error(err.Error()) 649 } 650 } 651 652 bs, err = client.Agent().Lookup("goroutine", opts, nil) 653 if err != nil { 654 c.Ui.Error(fmt.Sprintf("%s: Failed to retrieve pprof goroutine.prof, err: %v", path, err)) 655 } else { 656 err := c.writeBytes(path, "goroutine.prof", bs) 657 if err != nil { 658 c.Ui.Error(err.Error()) 659 } 660 } 661 662 // Gather goroutine text output - debug type 1 663 // debug type 1 writes the legacy text format for human readable output 664 opts.Debug = 1 665 bs, err = client.Agent().Lookup("goroutine", opts, nil) 666 if err != nil { 667 c.Ui.Error(fmt.Sprintf("%s: Failed to retrieve pprof goroutine-debug1.txt, err: %v", path, err)) 668 } else { 669 err := c.writeBytes(path, "goroutine-debug1.txt", bs) 670 if err != nil { 671 c.Ui.Error(err.Error()) 672 } 673 } 674 675 // Gather goroutine text output - debug type 2 676 // When printing the "goroutine" profile, debug=2 means to print the goroutine 677 // stacks in the same form that a Go program uses when dying due to an unrecovered panic. 678 opts.Debug = 2 679 bs, err = client.Agent().Lookup("goroutine", opts, nil) 680 if err != nil { 681 c.Ui.Error(fmt.Sprintf("%s: Failed to retrieve pprof goroutine-debug2.txt, err: %v", path, err)) 682 } else { 683 err := c.writeBytes(path, "goroutine-debug2.txt", bs) 684 if err != nil { 685 c.Ui.Error(err.Error()) 686 } 687 } 688 } 689 690 // collectPeriodic runs for duration, capturing the cluster state every interval. It flushes and stops 691 // the monitor requests 692 func (c *OperatorDebugCommand) collectPeriodic(client *api.Client) { 693 duration := time.After(c.duration) 694 // Set interval to 0 so that we immediately execute, wait the interval next time 695 interval := time.After(0 * time.Second) 696 var intervalCount int 697 var name, dir string 698 699 for { 700 select { 701 case <-duration: 702 c.cancel() 703 return 704 705 case <-interval: 706 name = fmt.Sprintf("%04d", intervalCount) 707 dir = filepath.Join("nomad", name) 708 c.Ui.Output(fmt.Sprintf(" Capture interval %s", name)) 709 c.collectNomad(dir, client) 710 c.collectOperator(dir, client) 711 interval = time.After(c.interval) 712 intervalCount++ 713 714 case <-c.ctx.Done(): 715 return 716 } 717 } 718 } 719 720 // collectOperator captures some cluster meta information 721 func (c *OperatorDebugCommand) collectOperator(dir string, client *api.Client) { 722 rc, err := client.Operator().RaftGetConfiguration(nil) 723 c.writeJSON(dir, "operator-raft.json", rc, err) 724 725 sc, _, err := client.Operator().SchedulerGetConfiguration(nil) 726 c.writeJSON(dir, "operator-scheduler.json", sc, err) 727 728 ah, _, err := client.Operator().AutopilotServerHealth(nil) 729 c.writeJSON(dir, "operator-autopilot-health.json", ah, err) 730 731 lic, _, err := client.Operator().LicenseGet(nil) 732 c.writeJSON(dir, "license.json", lic, err) 733 } 734 735 // collectNomad captures the nomad cluster state 736 func (c *OperatorDebugCommand) collectNomad(dir string, client *api.Client) error { 737 var qo *api.QueryOptions 738 739 js, _, err := client.Jobs().List(qo) 740 c.writeJSON(dir, "jobs.json", js, err) 741 742 ds, _, err := client.Deployments().List(qo) 743 c.writeJSON(dir, "deployments.json", ds, err) 744 745 es, _, err := client.Evaluations().List(qo) 746 c.writeJSON(dir, "evaluations.json", es, err) 747 748 as, _, err := client.Allocations().List(qo) 749 c.writeJSON(dir, "allocations.json", as, err) 750 751 ns, _, err := client.Nodes().List(qo) 752 c.writeJSON(dir, "nodes.json", ns, err) 753 754 // CSI Plugins - /v1/plugins?type=csi 755 ps, _, err := client.CSIPlugins().List(qo) 756 c.writeJSON(dir, "plugins.json", ps, err) 757 758 // CSI Plugin details - /v1/plugin/csi/:plugin_id 759 for _, p := range ps { 760 csiPlugin, _, err := client.CSIPlugins().Info(p.ID, qo) 761 csiPluginFileName := fmt.Sprintf("csi-plugin-id-%s.json", p.ID) 762 c.writeJSON(dir, csiPluginFileName, csiPlugin, err) 763 } 764 765 // CSI Volumes - /v1/volumes?type=csi 766 csiVolumes, _, err := client.CSIVolumes().List(qo) 767 c.writeJSON(dir, "csi-volumes.json", csiVolumes, err) 768 769 // CSI Volume details - /v1/volumes/csi/:volume-id 770 for _, v := range csiVolumes { 771 csiVolume, _, err := client.CSIVolumes().Info(v.ID, qo) 772 csiFileName := fmt.Sprintf("csi-volume-id-%s.json", v.ID) 773 c.writeJSON(dir, csiFileName, csiVolume, err) 774 } 775 776 metrics, _, err := client.Operator().MetricsSummary(qo) 777 c.writeJSON(dir, "metrics.json", metrics, err) 778 779 return nil 780 } 781 782 // collectConsul calls the Consul API directly to collect data 783 func (c *OperatorDebugCommand) collectConsul(dir, consul string) error { 784 addr := c.consul.addr(consul) 785 if addr == "" { 786 return nil 787 } 788 789 client := defaultHttpClient() 790 api.ConfigureTLS(client, c.consul.tls) 791 792 req, _ := http.NewRequest("GET", addr+"/v1/agent/self", nil) 793 req.Header.Add("X-Consul-Token", c.consul.token()) 794 req.Header.Add("User-Agent", userAgent) 795 resp, err := client.Do(req) 796 c.writeBody(dir, "consul-agent-self.json", resp, err) 797 798 req, _ = http.NewRequest("GET", addr+"/v1/agent/members", nil) 799 req.Header.Add("X-Consul-Token", c.consul.token()) 800 req.Header.Add("User-Agent", userAgent) 801 resp, err = client.Do(req) 802 c.writeBody(dir, "consul-agent-members.json", resp, err) 803 804 return nil 805 } 806 807 // collectVault calls the Vault API directly to collect data 808 func (c *OperatorDebugCommand) collectVault(dir, vault string) error { 809 addr := c.vault.addr(vault) 810 if addr == "" { 811 return nil 812 } 813 814 client := defaultHttpClient() 815 api.ConfigureTLS(client, c.vault.tls) 816 817 req, _ := http.NewRequest("GET", addr+"/sys/health", nil) 818 req.Header.Add("X-Vault-Token", c.vault.token()) 819 req.Header.Add("User-Agent", userAgent) 820 resp, err := client.Do(req) 821 c.writeBody(dir, "vault-sys-health.json", resp, err) 822 823 return nil 824 } 825 826 // writeBytes writes a file to the archive, recording it in the manifest 827 func (c *OperatorDebugCommand) writeBytes(dir, file string, data []byte) error { 828 // Replace invalid characters in filename 829 filename := helper.CleanFilename(file, "_") 830 831 relativePath := filepath.Join(dir, filename) 832 c.manifest = append(c.manifest, relativePath) 833 dirPath := filepath.Join(c.collectDir, dir) 834 filePath := filepath.Join(dirPath, filename) 835 836 // Ensure parent directories exist 837 err := os.MkdirAll(dirPath, os.ModePerm) 838 if err != nil { 839 return fmt.Errorf("failed to create parent directories of \"%s\": %w", dirPath, err) 840 } 841 842 // Ensure filename doesn't escape the sandbox of the capture directory 843 escapes := helper.PathEscapesSandbox(c.collectDir, filePath) 844 if escapes { 845 return fmt.Errorf("file path \"%s\" escapes capture directory \"%s\"", filePath, c.collectDir) 846 } 847 848 // Create the file 849 fh, err := os.Create(filePath) 850 if err != nil { 851 return fmt.Errorf("failed to create file \"%s\", err: %w", filePath, err) 852 } 853 defer fh.Close() 854 855 _, err = fh.Write(data) 856 if err != nil { 857 return fmt.Errorf("Failed to write data to file \"%s\", err: %w", filePath, err) 858 } 859 return nil 860 } 861 862 // writeJSON writes JSON responses from the Nomad API calls to the archive 863 func (c *OperatorDebugCommand) writeJSON(dir, file string, data interface{}, err error) error { 864 if err != nil { 865 return c.writeError(dir, file, err) 866 } 867 bytes, err := json.Marshal(data) 868 if err != nil { 869 return c.writeError(dir, file, err) 870 } 871 err = c.writeBytes(dir, file, bytes) 872 if err != nil { 873 c.Ui.Error(err.Error()) 874 } 875 return nil 876 } 877 878 // writeError writes a JSON error object to capture errors in the debug bundle without 879 // reporting 880 func (c *OperatorDebugCommand) writeError(dir, file string, err error) error { 881 bytes, err := json.Marshal(errorWrapper{Error: err.Error()}) 882 if err != nil { 883 return err 884 } 885 return c.writeBytes(dir, file, bytes) 886 } 887 888 type errorWrapper struct { 889 Error string 890 } 891 892 // writeBody is a helper that writes the body of an http.Response to the archive 893 func (c *OperatorDebugCommand) writeBody(dir, file string, resp *http.Response, err error) { 894 if err != nil { 895 c.writeError(dir, file, err) 896 return 897 } 898 899 if resp.ContentLength == 0 { 900 return 901 } 902 903 defer resp.Body.Close() 904 905 body, err := ioutil.ReadAll(resp.Body) 906 if err != nil { 907 c.writeError(dir, file, err) 908 return 909 } 910 911 if err := c.writeBytes(dir, file, body); err != nil { 912 c.Ui.Error(err.Error()) 913 } 914 } 915 916 // writeManifest creates the index files 917 func (c *OperatorDebugCommand) writeManifest() error { 918 // Write the JSON 919 path := filepath.Join(c.collectDir, "index.json") 920 jsonFh, err := os.Create(path) 921 if err != nil { 922 return err 923 } 924 defer jsonFh.Close() 925 926 json.NewEncoder(jsonFh).Encode(c.manifest) 927 928 // Write the HTML 929 path = filepath.Join(c.collectDir, "index.html") 930 htmlFh, err := os.Create(path) 931 if err != nil { 932 return err 933 } 934 defer htmlFh.Close() 935 936 head, _ := template.New("head").Parse("<html><head><title>{{.}}</title></head>\n<body><h1>{{.}}</h1>\n<ul>") 937 line, _ := template.New("line").Parse("<li><a href=\"{{.}}\">{{.}}</a></li>\n") 938 if err != nil { 939 return fmt.Errorf("%v", err) 940 } 941 tail := "</ul></body></html>\n" 942 943 head.Execute(htmlFh, c.timestamp) 944 for _, f := range c.manifest { 945 line.Execute(htmlFh, f) 946 } 947 htmlFh.WriteString(tail) 948 949 return nil 950 } 951 952 // trap captures signals, and closes stopCh 953 func (c *OperatorDebugCommand) trap() { 954 sigCh := make(chan os.Signal, 1) 955 signal.Notify(sigCh, 956 syscall.SIGHUP, 957 syscall.SIGINT, 958 syscall.SIGTERM, 959 syscall.SIGQUIT) 960 961 go func() { 962 <-sigCh 963 c.cancel() 964 }() 965 } 966 967 // TarCZF, like the tar command, recursively builds a gzip compressed tar archive from a 968 // directory. If not empty, all files in the bundle are prefixed with the target path 969 func TarCZF(archive string, src, target string) error { 970 // ensure the src actually exists before trying to tar it 971 if _, err := os.Stat(src); err != nil { 972 return fmt.Errorf("Unable to tar files - %v", err.Error()) 973 } 974 975 // create the archive 976 fh, err := os.Create(archive) 977 if err != nil { 978 return err 979 } 980 defer fh.Close() 981 982 zz := gzip.NewWriter(fh) 983 defer zz.Close() 984 985 tw := tar.NewWriter(zz) 986 defer tw.Close() 987 988 // tar 989 return filepath.Walk(src, func(file string, fi os.FileInfo, err error) error { 990 991 // return on any error 992 if err != nil { 993 return err 994 } 995 996 if !fi.Mode().IsRegular() { 997 return nil 998 } 999 1000 header, err := tar.FileInfoHeader(fi, fi.Name()) 1001 if err != nil { 1002 return err 1003 } 1004 1005 // remove leading path to the src, so files are relative to the archive 1006 path := strings.Replace(file, src, "", -1) 1007 if target != "" { 1008 path = filepath.Join([]string{target, path}...) 1009 } 1010 path = strings.TrimPrefix(path, string(filepath.Separator)) 1011 1012 header.Name = path 1013 1014 if err := tw.WriteHeader(header); err != nil { 1015 return err 1016 } 1017 1018 // copy the file contents 1019 f, err := os.Open(file) 1020 if err != nil { 1021 return err 1022 } 1023 1024 if _, err := io.Copy(tw, f); err != nil { 1025 return err 1026 } 1027 1028 f.Close() 1029 1030 return nil 1031 }) 1032 } 1033 1034 // argNodes splits node ids from the command line by "," 1035 func argNodes(input string) []string { 1036 ns := strings.Split(input, ",") 1037 var out []string 1038 for _, n := range ns { 1039 s := strings.TrimSpace(n) 1040 if s == "" { 1041 continue 1042 } 1043 out = append(out, s) 1044 } 1045 return out 1046 } 1047 1048 // external holds address configuration for Consul and Vault APIs 1049 type external struct { 1050 tls *api.TLSConfig 1051 addrVal string 1052 auth string 1053 ssl bool 1054 tokenVal string 1055 tokenFile string 1056 } 1057 1058 func (e *external) addr(defaultAddr string) string { 1059 if e.addrVal == "" { 1060 return defaultAddr 1061 } 1062 1063 if !e.ssl { 1064 if strings.HasPrefix(e.addrVal, "http:") { 1065 return e.addrVal 1066 } 1067 if strings.HasPrefix(e.addrVal, "https:") { 1068 // Mismatch: e.ssl=false but addrVal is https 1069 return strings.ReplaceAll(e.addrVal, "https://", "http://") 1070 } 1071 return "http://" + e.addrVal 1072 } 1073 1074 if strings.HasPrefix(e.addrVal, "https:") { 1075 return e.addrVal 1076 } 1077 1078 if strings.HasPrefix(e.addrVal, "http:") { 1079 // Mismatch: e.ssl=true but addrVal is http 1080 return strings.ReplaceAll(e.addrVal, "http://", "https://") 1081 } 1082 1083 return "https://" + e.addrVal 1084 } 1085 1086 func (e *external) token() string { 1087 if e.tokenVal != "" { 1088 return e.tokenVal 1089 } 1090 1091 if e.tokenFile != "" { 1092 bs, err := ioutil.ReadFile(e.tokenFile) 1093 if err == nil { 1094 return strings.TrimSpace(string(bs)) 1095 } 1096 } 1097 1098 return "" 1099 } 1100 1101 // defaultHttpClient configures a basic httpClient 1102 func defaultHttpClient() *http.Client { 1103 httpClient := cleanhttp.DefaultClient() 1104 transport := httpClient.Transport.(*http.Transport) 1105 transport.TLSHandshakeTimeout = 10 * time.Second 1106 transport.TLSClientConfig = &tls.Config{ 1107 MinVersion: tls.VersionTLS12, 1108 } 1109 1110 return httpClient 1111 }