github.com/hernad/nomad@v1.6.112/command/operator_debug.go (about) 1 // Copyright (c) HashiCorp, Inc. 2 // SPDX-License-Identifier: MPL-2.0 3 4 package command 5 6 import ( 7 "archive/tar" 8 "compress/gzip" 9 "context" 10 "crypto/tls" 11 "encoding/json" 12 "errors" 13 "flag" 14 "fmt" 15 "html/template" 16 "io" 17 "net/http" 18 "os" 19 "os/signal" 20 "path/filepath" 21 "strconv" 22 "strings" 23 "syscall" 24 "time" 25 26 "github.com/hashicorp/go-cleanhttp" 27 "github.com/hashicorp/go-multierror" 28 goversion "github.com/hashicorp/go-version" 29 "github.com/hernad/nomad/api" 30 "github.com/hernad/nomad/api/contexts" 31 "github.com/hernad/nomad/helper" 32 "github.com/hernad/nomad/helper/escapingfs" 33 "github.com/hernad/nomad/version" 34 "github.com/posener/complete" 35 "golang.org/x/exp/maps" 36 "golang.org/x/exp/slices" 37 ) 38 39 type OperatorDebugCommand struct { 40 Meta 41 42 timestamp string 43 collectDir string 44 duration time.Duration 45 interval time.Duration 46 pprofInterval time.Duration 47 pprofDuration time.Duration 48 logLevel string 49 maxNodes int 50 nodeClass string 51 nodeIDs []string 52 serverIDs []string 53 topics map[api.Topic][]string 54 index uint64 55 consul *external 56 vault *external 57 manifest []string 58 ctx context.Context 59 cancel context.CancelFunc 60 opts *api.QueryOptions 61 verbose bool 62 members *api.ServerMembers 63 nodes []*api.NodeListStub 64 } 65 66 const ( 67 userAgent = "nomad operator debug" 68 clusterDir = "cluster" 69 clientDir = "client" 70 serverDir = "server" 71 intervalDir = "interval" 72 minimumVersionPprofConstraint = ">= 0.11.0, <= 0.11.2" 73 ) 74 75 func (c *OperatorDebugCommand) Help() string { 76 helpText := ` 77 Usage: nomad operator debug [options] 78 79 Build an archive containing Nomad cluster configuration and state, and Consul 80 and Vault status. Include logs and pprof profiles for selected servers and 81 client nodes. 
82 83 If ACLs are enabled, this command will require a token with the 'node:read' 84 capability to run. In order to collect information, the token will also 85 require the 'agent:read' and 'operator:read' capabilities, as well as the 86 'list-jobs' capability for all namespaces. To collect pprof profiles the 87 token will also require 'agent:write', or enable_debug configuration set to 88 true. 89 90 If event stream capture is enabled, the Job, Allocation, Deployment, 91 and Evaluation topics require 'namespace:read-job' capabilities, the Node 92 topic requires 'node:read'. A 'management' token is required to capture 93 ACLToken, ACLPolicy, or all all events. 94 95 General Options: 96 97 ` + generalOptionsUsage(usageOptsDefault|usageOptsNoNamespace) + ` 98 99 Consul Options: 100 101 -consul-http-addr=<addr> 102 The address and port of the Consul HTTP agent. Overrides the 103 CONSUL_HTTP_ADDR environment variable. 104 105 -consul-token=<token> 106 Token used to query Consul. Overrides the CONSUL_HTTP_TOKEN environment 107 variable and the Consul token file. 108 109 -consul-token-file=<path> 110 Path to the Consul token file. Overrides the CONSUL_HTTP_TOKEN_FILE 111 environment variable. 112 113 -consul-client-cert=<path> 114 Path to the Consul client cert file. Overrides the CONSUL_CLIENT_CERT 115 environment variable. 116 117 -consul-client-key=<path> 118 Path to the Consul client key file. Overrides the CONSUL_CLIENT_KEY 119 environment variable. 120 121 -consul-ca-cert=<path> 122 Path to a CA file to use with Consul. Overrides the CONSUL_CACERT 123 environment variable and the Consul CA path. 124 125 -consul-ca-path=<path> 126 Path to a directory of PEM encoded CA cert files to verify the Consul 127 certificate. Overrides the CONSUL_CAPATH environment variable. 128 129 Vault Options: 130 131 -vault-address=<addr> 132 The address and port of the Vault HTTP agent. Overrides the VAULT_ADDR 133 environment variable. 
134 135 -vault-token=<token> 136 Token used to query Vault. Overrides the VAULT_TOKEN environment 137 variable. 138 139 -vault-client-cert=<path> 140 Path to the Vault client cert file. Overrides the VAULT_CLIENT_CERT 141 environment variable. 142 143 -vault-client-key=<path> 144 Path to the Vault client key file. Overrides the VAULT_CLIENT_KEY 145 environment variable. 146 147 -vault-ca-cert=<path> 148 Path to a CA file to use with Vault. Overrides the VAULT_CACERT 149 environment variable and the Vault CA path. 150 151 -vault-ca-path=<path> 152 Path to a directory of PEM encoded CA cert files to verify the Vault 153 certificate. Overrides the VAULT_CAPATH environment variable. 154 155 Debug Options: 156 157 -duration=<duration> 158 Set the duration of the debug capture. Logs will be captured from specified servers and 159 nodes at "log-level". Defaults to 2m. 160 161 -event-index=<index> 162 Specifies the index to start streaming events from. If the requested index is 163 no longer in the buffer the stream will start at the next available index. 164 Defaults to 0. 165 166 -event-topic=<Allocation,Evaluation,Job,Node,*>:<filter> 167 Enable event stream capture, filtered by comma delimited list of topic filters. 168 Examples: 169 "all" or "*:*" for all events 170 "Evaluation" or "Evaluation:*" for all evaluation events 171 "*:example" for all events related to the job "example" 172 Defaults to "none" (disabled). 173 174 -interval=<interval> 175 The interval between snapshots of the Nomad state. Set interval equal to 176 duration to capture a single snapshot. Defaults to 30s. 177 178 -log-level=<level> 179 The log level to monitor. Defaults to DEBUG. 180 181 -max-nodes=<count> 182 Cap the maximum number of client nodes included in the capture. Defaults 183 to 10, set to 0 for unlimited. 184 185 -node-id=<node1>,<node2> 186 Comma separated list of Nomad client node ids to monitor for logs, API 187 outputs, and pprof profiles. 
Accepts id prefixes, and "all" to select all 188 nodes (up to count = max-nodes). Defaults to "all". 189 190 -node-class=<node-class> 191 Filter client nodes based on node class. 192 193 -pprof-duration=<duration> 194 Duration for pprof collection. Defaults to 1s or -duration, whichever is less. 195 196 -pprof-interval=<pprof-interval> 197 The interval between pprof collections. Set interval equal to 198 duration to capture a single snapshot. Defaults to 250ms or 199 -pprof-duration, whichever is less. 200 201 -server-id=<server1>,<server2> 202 Comma separated list of Nomad server names to monitor for logs, API 203 outputs, and pprof profiles. Accepts server names, "leader", or "all". 204 Defaults to "all". 205 206 -stale=<true|false> 207 If "false", the default, get membership data from the cluster leader. If 208 the cluster is in an outage unable to establish leadership, it may be 209 necessary to get the configuration from a non-leader server. 210 211 -output=<path> 212 Path to the parent directory of the output directory. If specified, no 213 archive is built. Defaults to the current directory. 214 215 -verbose 216 Enable verbose output. 
217 ` 218 return strings.TrimSpace(helpText) 219 } 220 221 func (c *OperatorDebugCommand) Synopsis() string { 222 return "Build a debug archive" 223 } 224 225 func (c *OperatorDebugCommand) AutocompleteFlags() complete.Flags { 226 return mergeAutocompleteFlags(c.Meta.AutocompleteFlags(FlagSetClient), 227 complete.Flags{ 228 "-duration": complete.PredictAnything, 229 "-event-index": complete.PredictAnything, 230 "-event-topic": complete.PredictAnything, 231 "-interval": complete.PredictAnything, 232 "-log-level": complete.PredictSet("TRACE", "DEBUG", "INFO", "WARN", "ERROR"), 233 "-max-nodes": complete.PredictAnything, 234 "-node-class": NodeClassPredictor(c.Client), 235 "-node-id": NodePredictor(c.Client), 236 "-server-id": ServerPredictor(c.Client), 237 "-output": complete.PredictDirs("*"), 238 "-pprof-duration": complete.PredictAnything, 239 "-consul-token": complete.PredictAnything, 240 "-vault-token": complete.PredictAnything, 241 "-verbose": complete.PredictAnything, 242 }) 243 } 244 245 func (c *OperatorDebugCommand) AutocompleteArgs() complete.Predictor { 246 return complete.PredictNothing 247 } 248 249 // NodePredictor returns a client node predictor 250 func NodePredictor(factory ApiClientFactory) complete.Predictor { 251 return complete.PredictFunc(func(a complete.Args) []string { 252 client, err := factory() 253 if err != nil { 254 return nil 255 } 256 257 // note we can't use the -stale flag here because we're in the 258 // predictor, but a stale query should be safe for prediction; 259 // we also can't use region forwarding because we can't rely 260 // on the server being up 261 resp, _, err := client.Search().PrefixSearch( 262 a.Last, contexts.Nodes, &api.QueryOptions{AllowStale: true}) 263 if err != nil { 264 return []string{} 265 } 266 return resp.Matches[contexts.Nodes] 267 }) 268 } 269 270 // NodeClassPredictor returns a client node class predictor 271 // TODO dmay: Consider API options for node class filtering 272 func NodeClassPredictor(factory 
ApiClientFactory) complete.Predictor { 273 return complete.PredictFunc(func(a complete.Args) []string { 274 client, err := factory() 275 if err != nil { 276 return nil 277 } 278 279 // note we can't use the -stale flag here because we're in the 280 // predictor, but a stale query should be safe for prediction; 281 // we also can't use region forwarding because we can't rely 282 // on the server being up 283 nodes, _, err := client.Nodes().List(&api.QueryOptions{AllowStale: true}) 284 if err != nil { 285 return []string{} 286 } 287 288 // Build map of unique node classes across all nodes 289 classes := make(map[string]bool) 290 for _, node := range nodes { 291 classes[node.NodeClass] = true 292 } 293 294 // Iterate over node classes looking for match 295 filtered := []string{} 296 for class := range classes { 297 if strings.HasPrefix(class, a.Last) { 298 filtered = append(filtered, class) 299 } 300 } 301 302 return filtered 303 }) 304 } 305 306 // ServerPredictor returns a server member predictor 307 // TODO dmay: Consider API options for server member filtering 308 func ServerPredictor(factory ApiClientFactory) complete.Predictor { 309 return complete.PredictFunc(func(a complete.Args) []string { 310 client, err := factory() 311 if err != nil { 312 return nil 313 } 314 315 // note we can't use the -stale flag here because we're in the 316 // predictor, but a stale query should be safe for prediction; 317 // we also can't use region forwarding because we can't rely 318 // on the server being up 319 members, err := client.Agent().MembersOpts(&api.QueryOptions{AllowStale: true}) 320 if err != nil { 321 return []string{} 322 } 323 324 // Iterate over server members looking for match 325 filtered := []string{} 326 for _, member := range members.Members { 327 if strings.HasPrefix(member.Name, a.Last) { 328 filtered = append(filtered, member.Name) 329 } 330 } 331 332 return filtered 333 }) 334 } 335 336 // queryOpts returns a copy of the shared api.QueryOptions so 337 // 
that api package methods can safely modify the options 338 func (c *OperatorDebugCommand) queryOpts() *api.QueryOptions { 339 qo := new(api.QueryOptions) 340 *qo = *c.opts 341 qo.Params = maps.Clone(c.opts.Params) 342 return qo 343 } 344 345 func (c *OperatorDebugCommand) Name() string { return "debug" } 346 347 func (c *OperatorDebugCommand) Run(args []string) int { 348 flags := c.Meta.FlagSet(c.Name(), FlagSetClient) 349 flags.Usage = func() { c.Ui.Output(c.Help()) } 350 351 var duration, interval, pprofInterval, output, pprofDuration, eventTopic string 352 var eventIndex int64 353 var nodeIDs, serverIDs string 354 var allowStale bool 355 356 flags.StringVar(&duration, "duration", "2m", "") 357 flags.Int64Var(&eventIndex, "event-index", 0, "") 358 flags.StringVar(&eventTopic, "event-topic", "none", "") 359 flags.StringVar(&interval, "interval", "30s", "") 360 flags.StringVar(&c.logLevel, "log-level", "DEBUG", "") 361 flags.IntVar(&c.maxNodes, "max-nodes", 10, "") 362 flags.StringVar(&c.nodeClass, "node-class", "", "") 363 flags.StringVar(&nodeIDs, "node-id", "all", "") 364 flags.StringVar(&serverIDs, "server-id", "all", "") 365 flags.BoolVar(&allowStale, "stale", false, "") 366 flags.StringVar(&output, "output", "", "") 367 flags.StringVar(&pprofDuration, "pprof-duration", "1s", "") 368 flags.StringVar(&pprofInterval, "pprof-interval", "250ms", "") 369 flags.BoolVar(&c.verbose, "verbose", false, "") 370 371 c.consul = &external{tls: &api.TLSConfig{}} 372 flags.StringVar(&c.consul.addrVal, "consul-http-addr", os.Getenv("CONSUL_HTTP_ADDR"), "") 373 ssl := os.Getenv("CONSUL_HTTP_SSL") 374 c.consul.ssl, _ = strconv.ParseBool(ssl) 375 flags.StringVar(&c.consul.auth, "consul-auth", os.Getenv("CONSUL_HTTP_AUTH"), "") 376 flags.StringVar(&c.consul.tokenVal, "consul-token", os.Getenv("CONSUL_HTTP_TOKEN"), "") 377 flags.StringVar(&c.consul.tokenFile, "consul-token-file", os.Getenv("CONSUL_HTTP_TOKEN_FILE"), "") 378 flags.StringVar(&c.consul.tls.ClientCert, 
"consul-client-cert", os.Getenv("CONSUL_CLIENT_CERT"), "") 379 flags.StringVar(&c.consul.tls.ClientKey, "consul-client-key", os.Getenv("CONSUL_CLIENT_KEY"), "") 380 flags.StringVar(&c.consul.tls.CACert, "consul-ca-cert", os.Getenv("CONSUL_CACERT"), "") 381 flags.StringVar(&c.consul.tls.CAPath, "consul-ca-path", os.Getenv("CONSUL_CAPATH"), "") 382 383 c.vault = &external{tls: &api.TLSConfig{}} 384 flags.StringVar(&c.vault.addrVal, "vault-address", os.Getenv("VAULT_ADDR"), "") 385 flags.StringVar(&c.vault.tokenVal, "vault-token", os.Getenv("VAULT_TOKEN"), "") 386 flags.StringVar(&c.vault.tls.CACert, "vault-ca-cert", os.Getenv("VAULT_CACERT"), "") 387 flags.StringVar(&c.vault.tls.CAPath, "vault-ca-path", os.Getenv("VAULT_CAPATH"), "") 388 flags.StringVar(&c.vault.tls.ClientCert, "vault-client-cert", os.Getenv("VAULT_CLIENT_CERT"), "") 389 flags.StringVar(&c.vault.tls.ClientKey, "vault-client-key", os.Getenv("VAULT_CLIENT_KEY"), "") 390 391 if err := flags.Parse(args); err != nil { 392 c.Ui.Error(fmt.Sprintf("Error parsing arguments: %q", err)) 393 return 1 394 } 395 396 // Parse the capture duration 397 d, err := time.ParseDuration(duration) 398 if err != nil { 399 c.Ui.Error(fmt.Sprintf("Error parsing duration: %s: %s", duration, err.Error())) 400 return 1 401 } 402 c.duration = d 403 404 // Parse the capture interval 405 i, err := time.ParseDuration(interval) 406 if err != nil { 407 c.Ui.Error(fmt.Sprintf("Error parsing interval: %s: %s", interval, err.Error())) 408 return 1 409 } 410 c.interval = i 411 412 // Validate interval 413 if i.Seconds() > d.Seconds() { 414 c.Ui.Error(fmt.Sprintf("Error parsing interval: %s is greater than duration %s", interval, duration)) 415 return 1 416 } 417 418 // Parse and clamp the pprof capture duration 419 pd, err := time.ParseDuration(pprofDuration) 420 if err != nil { 421 c.Ui.Error(fmt.Sprintf("Error parsing pprof duration: %s: %s", pprofDuration, err.Error())) 422 return 1 423 } 424 if pd.Seconds() > d.Seconds() { 425 pd = d 
426 } 427 c.pprofDuration = pd 428 429 // Parse and clamp the pprof capture interval 430 pi, err := time.ParseDuration(pprofInterval) 431 if err != nil { 432 c.Ui.Error(fmt.Sprintf("Error parsing pprof-interval: %s: %s", pprofInterval, err.Error())) 433 return 1 434 } 435 if pi.Seconds() > pd.Seconds() { 436 pi = pd 437 } 438 c.pprofInterval = pi 439 440 // Parse event stream topic filter 441 t, err := topicsFromString(eventTopic) 442 if err != nil { 443 c.Ui.Error(fmt.Sprintf("Error parsing event topics: %v", err)) 444 return 1 445 } 446 c.topics = t 447 448 // Validate and set initial event stream index 449 if eventIndex < 0 { 450 c.Ui.Error("Event stream index must be greater than zero") 451 return 1 452 } 453 c.index = uint64(eventIndex) 454 455 // Verify there are no extra arguments 456 args = flags.Args() 457 if l := len(args); l != 0 { 458 c.Ui.Error("This command takes no arguments") 459 c.Ui.Error(commandErrorText(c)) 460 return 1 461 } 462 463 // Initialize capture variables and structs 464 c.manifest = make([]string, 0) 465 ctx, cancel := context.WithCancel(context.Background()) 466 c.ctx = ctx 467 c.cancel = cancel 468 c.trap() 469 470 // Generate timestamped file name 471 format := "2006-01-02-150405Z" 472 c.timestamp = time.Now().UTC().Format(format) 473 stamped := "nomad-debug-" + c.timestamp 474 475 // Create the output directory 476 var tmp string 477 if output != "" { 478 // User specified output directory 479 tmp = filepath.Join(output, stamped) 480 _, err := os.Stat(tmp) 481 if !os.IsNotExist(err) { 482 c.Ui.Error("Output directory already exists") 483 return 2 484 } 485 } else { 486 // Generate temp directory 487 tmp, err = os.MkdirTemp(os.TempDir(), stamped) 488 if err != nil { 489 c.Ui.Error(fmt.Sprintf("Error creating tmp directory: %s", err.Error())) 490 return 2 491 } 492 defer os.RemoveAll(tmp) 493 } 494 495 c.collectDir = tmp 496 497 // Write CLI flags to JSON file 498 c.writeFlags(flags) 499 500 // Create an instance of the API client 
501 client, err := c.Meta.Client() 502 if err != nil { 503 c.Ui.Error(fmt.Sprintf("Error initializing client: %s", err.Error())) 504 return 1 505 } 506 507 c.opts = &api.QueryOptions{ 508 Region: c.Meta.region, 509 AllowStale: allowStale, 510 AuthToken: c.Meta.token, 511 } 512 513 // Get complete list of client nodes 514 c.nodes, _, err = client.Nodes().List(c.queryOpts()) 515 if err != nil { 516 c.Ui.Error(fmt.Sprintf("Error querying node info: %v", err)) 517 return 1 518 } 519 520 // Write nodes to file 521 c.reportErr(writeResponseToFile(c.nodes, c.newFile(clusterDir, "nodes.json"))) 522 523 // Search all nodes If a node class is specified without a list of node id prefixes 524 if c.nodeClass != "" && nodeIDs == "" { 525 nodeIDs = "all" 526 } 527 528 // Resolve client node id prefixes 529 nodesFound := 0 530 nodeLookupFailCount := 0 531 nodeCaptureCount := 0 532 533 for _, id := range stringToSlice(nodeIDs) { 534 if id == "all" { 535 // Capture from all nodes using empty prefix filter 536 id = "" 537 } else { 538 // Capture from nodes starting with prefix id 539 id = sanitizeUUIDPrefix(id) 540 } 541 nodes, _, err := client.Nodes().PrefixListOpts(id, c.queryOpts()) 542 if err != nil { 543 c.Ui.Error(fmt.Sprintf("Error querying node info: %s", err)) 544 return 1 545 } 546 547 // Increment fail count if no nodes are found 548 if len(nodes) == 0 { 549 c.Ui.Error(fmt.Sprintf("No node(s) with prefix %q found", id)) 550 nodeLookupFailCount++ 551 continue 552 } 553 554 nodesFound += len(nodes) 555 556 // Apply constraints to nodes found 557 for _, n := range nodes { 558 // Ignore nodes that do not match specified class 559 if c.nodeClass != "" && n.NodeClass != c.nodeClass { 560 continue 561 } 562 563 // Add node to capture list 564 c.nodeIDs = append(c.nodeIDs, n.ID) 565 nodeCaptureCount++ 566 567 // Stop looping when we reach the max 568 if c.maxNodes != 0 && nodeCaptureCount >= c.maxNodes { 569 break 570 } 571 } 572 } 573 574 // Return error if nodes were specified 
but none were found 575 if len(nodeIDs) > 0 && nodeCaptureCount == 0 { 576 if nodeIDs == "all" { 577 // It's okay to have zero clients for default "all" 578 c.Ui.Info("Note: \"-node-id=all\" specified but no clients found") 579 } else { 580 c.Ui.Error(fmt.Sprintf("Failed to retrieve clients, 0 nodes found in list: %s", nodeIDs)) 581 return 1 582 } 583 } 584 585 // Resolve servers 586 c.members, err = client.Agent().MembersOpts(c.queryOpts()) 587 if err != nil { 588 c.Ui.Error(fmt.Sprintf("Failed to retrieve server list; err: %v", err)) 589 return 1 590 } 591 592 // Write complete list of server members to file 593 c.reportErr(writeResponseToFile(c.members, c.newFile(clusterDir, "members.json"))) 594 595 // Get leader and write to file; there's no option for AllowStale 596 // on this API and a stale result wouldn't even be meaningful, so 597 // only warn if we fail so that we don't stop the rest of the 598 // debugging 599 leader, err := client.Status().Leader() 600 if err != nil { 601 c.Ui.Warn(fmt.Sprintf("Failed to retrieve leader; err: %v", err)) 602 } 603 if len(leader) > 0 { 604 c.reportErr(writeResponseToFile(leader, c.newFile(clusterDir, "leader.json"))) 605 } 606 607 // Filter for servers matching criteria 608 c.serverIDs, err = filterServerMembers(c.members, serverIDs, c.region) 609 if err != nil { 610 c.Ui.Error(fmt.Sprintf("Failed to parse server list; err: %v", err)) 611 return 1 612 } 613 614 serversFound := 0 615 serverCaptureCount := 0 616 617 if c.members != nil { 618 serversFound = len(c.members.Members) 619 } 620 if c.serverIDs != nil { 621 serverCaptureCount = len(c.serverIDs) 622 } 623 624 // Return error if servers were specified but not found 625 if len(serverIDs) > 0 && serverCaptureCount == 0 { 626 c.Ui.Error(fmt.Sprintf("Failed to retrieve servers, 0 members found in list: %s", serverIDs)) 627 return 1 628 } 629 630 // Display general info about the capture 631 c.Ui.Output("Starting debugger...") 632 c.Ui.Output("") 633 
c.Ui.Output(fmt.Sprintf("Nomad CLI Version: %s", version.GetVersion().FullVersionNumber(true))) 634 c.Ui.Output(fmt.Sprintf(" Region: %s", c.region)) 635 c.Ui.Output(fmt.Sprintf(" Namespace: %s", c.namespace)) 636 c.Ui.Output(fmt.Sprintf(" Servers: (%d/%d) %v", serverCaptureCount, serversFound, c.serverIDs)) 637 c.Ui.Output(fmt.Sprintf(" Clients: (%d/%d) %v", nodeCaptureCount, nodesFound, c.nodeIDs)) 638 if nodeCaptureCount > 0 && nodeCaptureCount == c.maxNodes { 639 c.Ui.Output(fmt.Sprintf(" Max node count reached (%d)", c.maxNodes)) 640 } 641 if nodeLookupFailCount > 0 { 642 c.Ui.Output(fmt.Sprintf("Client fail count: %v", nodeLookupFailCount)) 643 } 644 if c.nodeClass != "" { 645 c.Ui.Output(fmt.Sprintf(" Node Class: %s", c.nodeClass)) 646 } 647 c.Ui.Output(fmt.Sprintf(" Interval: %s", interval)) 648 c.Ui.Output(fmt.Sprintf(" Duration: %s", duration)) 649 c.Ui.Output(fmt.Sprintf(" pprof Interval: %s", pprofInterval)) 650 if c.pprofDuration.Seconds() != 1 { 651 c.Ui.Output(fmt.Sprintf(" pprof Duration: %s", c.pprofDuration)) 652 } 653 if c.topics != nil { 654 c.Ui.Output(fmt.Sprintf(" Event topics: %+v", c.topics)) 655 } 656 c.Ui.Output("") 657 c.Ui.Output("Capturing cluster data...") 658 659 // Start collecting data 660 err = c.collect(client) 661 if err != nil { 662 c.Ui.Error(fmt.Sprintf("Error collecting data: %s", err.Error())) 663 return 2 664 } 665 666 // Write index json/html manifest files 667 c.writeManifest() 668 669 // Exit before archive if output directory was specified 670 if output != "" { 671 c.Ui.Output(fmt.Sprintf("Created debug directory: %s", c.collectDir)) 672 return 0 673 } 674 675 // Create archive tarball 676 archiveFile := stamped + ".tar.gz" 677 err = TarCZF(archiveFile, tmp, stamped) 678 if err != nil { 679 c.Ui.Error(fmt.Sprintf("Error creating archive: %s", err.Error())) 680 return 2 681 } 682 683 // Final output with name of tarball 684 c.Ui.Output(fmt.Sprintf("Created debug archive: %s", archiveFile)) 685 return 0 686 } 687 688 // 
// collect collects data from our endpoints and writes the archive bundle
func (c *OperatorDebugCommand) collect(client *api.Client) error {
	// Start background captures; these run until c.ctx is cancelled.
	c.startMonitors(client)
	c.startEventStream(client)

	// Collect cluster data. Each collector's error is reported through
	// reportErr (or embedded in the written file) rather than aborting,
	// so the capture is best-effort; collect itself always returns nil.
	self, err := client.Agent().Self()
	c.reportErr(writeResponseOrErrorToFile(
		self, err, c.newFile(clusterDir, "agent-self.json")))

	namespaces, _, err := client.Namespaces().List(c.queryOpts())
	c.reportErr(writeResponseOrErrorToFile(
		namespaces, err, c.newFile(clusterDir, "namespaces.json")))

	regions, err := client.Regions().List()
	c.reportErr(writeResponseOrErrorToFile(
		regions, err, c.newFile(clusterDir, "regions.json")))

	// Collect data from Consul; fall back to the address advertised by
	// the agent's self endpoint when no address was given via flag/env.
	if c.consul.addrVal == "" {
		c.getConsulAddrFromSelf(self)
	}
	c.collectConsul(clusterDir)

	// Collect data from Vault, with the same self-endpoint fallback.
	vaultAddr := c.vault.addrVal
	if vaultAddr == "" {
		vaultAddr = c.getVaultAddrFromSelf(self)
	}
	c.collectVault(clusterDir, vaultAddr)

	c.collectAgentHosts(client)
	c.collectPeriodicPprofs(client)

	// collectPeriodic blocks for the capture duration.
	c.collectPeriodic(client)

	return nil
}

// path returns platform specific paths in the tmp root directory
func (c *OperatorDebugCommand) path(paths ...string) string {
	ps := []string{c.collectDir}
	ps = append(ps, paths...)
	return filepath.Join(ps...)
}

// mkdir creates directories in the tmp root directory
func (c *OperatorDebugCommand) mkdir(paths ...string) error {
	joinedPath := c.path(paths...)

	// Ensure path doesn't escape the sandbox of the capture directory
	escapes := escapingfs.PathEscapesSandbox(c.collectDir, joinedPath)
	if escapes {
		return fmt.Errorf("file path escapes capture directory")
	}

	return escapingfs.EnsurePath(joinedPath, true)
}

// startMonitors starts go routines for each node and client
func (c *OperatorDebugCommand) startMonitors(client *api.Client) {
	for _, id := range c.nodeIDs {
		go c.startMonitor(clientDir, "node_id", id, client)
	}

	for _, id := range c.serverIDs {
		go c.startMonitor(serverDir, "server_id", id, client)
	}
}

// startMonitor starts one monitor api request, writing to a file. It blocks and should be
// called in a go routine. Errors are ignored, we want to build the archive even if a node
// is unavailable
func (c *OperatorDebugCommand) startMonitor(path, idKey, nodeID string, client *api.Client) {
	// mkdir/Create errors are deliberately ignored: monitoring is
	// best-effort and a failure here must not abort the capture.
	c.mkdir(path, nodeID)
	fh, err := os.Create(c.path(path, nodeID, "monitor.log"))
	if err != nil {
		return
	}
	defer fh.Close()

	// idKey is either "node_id" or "server_id" and routes the monitor
	// request to the correct agent.
	qo := api.QueryOptions{
		Params: map[string]string{
			idKey:       nodeID,
			"log_level": c.logLevel,
		},
		AllowStale: c.queryOpts().AllowStale,
	}

	// Stream log frames until the monitor errors or the capture context
	// is cancelled; a nil frame is a keep-alive and is skipped.
	outCh, errCh := client.Agent().Monitor(c.ctx.Done(), &qo)
	for {
		select {
		case out := <-outCh:
			if out == nil {
				continue
			}
			fh.Write(out.Data)

		case err := <-errCh:
			fh.WriteString(fmt.Sprintf("monitor: %s\n", err.Error()))
			return

		case <-c.ctx.Done():
			return
		}
	}
}

// startEventStream wraps the event stream capture process in a goroutine.
798 func (c *OperatorDebugCommand) startEventStream(client *api.Client) { 799 c.verboseOut("Launching eventstream goroutine...") 800 801 go func() { 802 if err := c.captureEventStream(client); err != nil { 803 var es string 804 if mErr, ok := err.(*multierror.Error); ok { 805 es = multierror.ListFormatFunc(mErr.Errors) 806 } else { 807 es = err.Error() 808 } 809 810 c.Ui.Error(fmt.Sprintf("Error capturing event stream: %s", es)) 811 } 812 }() 813 } 814 815 func (c *OperatorDebugCommand) captureEventStream(client *api.Client) error { 816 // Ensure output directory is present 817 path := clusterDir 818 if err := c.mkdir(c.path(path)); err != nil { 819 return err 820 } 821 822 // Create the output file 823 fh, err := os.Create(c.path(path, "eventstream.json")) 824 if err != nil { 825 return err 826 } 827 defer fh.Close() 828 829 // Get handle to events endpoint 830 events := client.EventStream() 831 832 // Start streaming events 833 eventCh, err := events.Stream(c.ctx, c.topics, c.index, c.queryOpts()) 834 if err != nil { 835 if errors.Is(err, context.Canceled) { 836 c.verboseOut("Event stream canceled: No events captured") 837 return nil 838 } 839 return fmt.Errorf("failed to stream events: %w", err) 840 } 841 842 eventCount := 0 843 errCount := 0 844 heartbeatCount := 0 845 channelEventCount := 0 846 847 var mErrs *multierror.Error 848 849 for { 850 select { 851 case event := <-eventCh: 852 channelEventCount++ 853 if event.Err != nil { 854 errCount++ 855 c.verboseOutf("error from event stream: index; %d err: %v", event.Index, event.Err) 856 mErrs = multierror.Append(mErrs, fmt.Errorf("error at index: %d, Err: %w", event.Index, event.Err)) 857 break 858 } 859 860 if event.IsHeartbeat() { 861 heartbeatCount++ 862 continue 863 } 864 865 for _, e := range event.Events { 866 eventCount++ 867 c.verboseOutf("Event: %4d, Index: %d, Topic: %-10s, Type: %s, FilterKeys: %s", eventCount, e.Index, e.Topic, e.Type, e.FilterKeys) 868 869 bytes, err := json.Marshal(e) 870 if err != 
nil { 871 errCount++ 872 mErrs = multierror.Append(mErrs, fmt.Errorf("failed to marshal json from Topic: %s, Type: %s, Err: %w", e.Topic, e.Type, err)) 873 } 874 875 n, err := fh.Write(bytes) 876 if err != nil { 877 errCount++ 878 mErrs = multierror.Append(mErrs, fmt.Errorf("failed to write bytes to eventstream.json; bytes written: %d, Err: %w", n, err)) 879 break 880 } 881 n, err = fh.WriteString("\n") 882 if err != nil { 883 errCount++ 884 mErrs = multierror.Append(mErrs, fmt.Errorf("failed to write string to eventstream.json; chars written: %d, Err: %w", n, err)) 885 } 886 } 887 case <-c.ctx.Done(): 888 c.verboseOutf("Event stream captured %d events, %d frames, %d heartbeats, %d errors", eventCount, channelEventCount, heartbeatCount, errCount) 889 return mErrs.ErrorOrNil() 890 } 891 } 892 } 893 894 // collectAgentHosts calls collectAgentHost for each selected node 895 func (c *OperatorDebugCommand) collectAgentHosts(client *api.Client) { 896 for _, n := range c.nodeIDs { 897 c.collectAgentHost(clientDir, n, client) 898 } 899 900 for _, n := range c.serverIDs { 901 c.collectAgentHost(serverDir, n, client) 902 } 903 } 904 905 // collectAgentHost gets the agent host data 906 func (c *OperatorDebugCommand) collectAgentHost(path, id string, client *api.Client) { 907 var host *api.HostDataResponse 908 var err error 909 if path == serverDir { 910 host, err = client.Agent().Host(id, "", c.queryOpts()) 911 } else { 912 host, err = client.Agent().Host("", id, c.queryOpts()) 913 } 914 915 if isRedirectError(err) { 916 c.Ui.Warn(fmt.Sprintf("%s/%s: /v1/agent/host unavailable on this agent", path, id)) 917 return 918 } 919 920 if err != nil { 921 c.Ui.Error(fmt.Sprintf("%s/%s: Failed to retrieve agent host data, err: %v", path, id, err)) 922 923 if strings.Contains(err.Error(), api.PermissionDeniedErrorContent) { 924 // Drop a hint to help the operator resolve the error 925 c.Ui.Warn("Agent host retrieval requires agent:read ACL or enable_debug=true. 
See https://www.nomadproject.io/api-docs/agent#host for more information.")
		}
		return // exit on any error
	}

	path = filepath.Join(path, id)
	c.reportErr(writeResponseToFile(host, c.newFile(path, "agent-host.json")))
}

// collectPeriodicPprofs captures pprof profiles from the selected servers and
// clients: one capture synchronously up front, then — unless only a single
// capture was requested (pprofInterval == pprofDuration) — repeatedly in a
// background goroutine every pprofInterval until the command duration elapses
// or the command context is cancelled.
func (c *OperatorDebugCommand) collectPeriodicPprofs(client *api.Client) {

	pprofNodeIDs := []string{}
	pprofServerIDs := []string{}

	// threadcreate pprof causes a panic on Nomad 0.11.0 to 0.11.2 -- skip those versions
	for _, serverID := range c.serverIDs {
		version := c.getNomadVersion(serverID, "")
		err := checkVersion(version, minimumVersionPprofConstraint)
		if err != nil {
			c.Ui.Warn(fmt.Sprintf("Skipping pprof: %v", err))
			// BUG FIX: previously the server ID was appended even after the
			// "Skipping" warning, so the affected agent was profiled anyway.
			continue
		}
		pprofServerIDs = append(pprofServerIDs, serverID)
	}

	for _, nodeID := range c.nodeIDs {
		version := c.getNomadVersion("", nodeID)
		err := checkVersion(version, minimumVersionPprofConstraint)
		if err != nil {
			c.Ui.Warn(fmt.Sprintf("Skipping pprof: %v", err))
			// BUG FIX: same as above, actually skip the node.
			continue
		}
		pprofNodeIDs = append(pprofNodeIDs, nodeID)
	}

	// Take the first set of pprofs synchronously...
	c.Ui.Output(" Capture pprofInterval 0000")
	c.collectPprofs(client, pprofServerIDs, pprofNodeIDs, 0)
	if c.pprofInterval == c.pprofDuration {
		return
	}

	// ... and then move the rest off into a goroutine
	go func() {
		ctx, cancel := context.WithTimeout(c.ctx, c.duration)
		defer cancel()
		timer, stop := helper.NewSafeTimer(c.pprofInterval)
		defer stop()

		pprofIntervalCount := 1
		for {
			select {
			case <-ctx.Done():
				return
			case <-timer.C:
				c.Ui.Output(fmt.Sprintf(" Capture pprofInterval %04d", pprofIntervalCount))
				c.collectPprofs(client, pprofServerIDs, pprofNodeIDs, pprofIntervalCount)
				timer.Reset(c.pprofInterval)
				pprofIntervalCount++
			}
		}
	}()
}

// collectPprofs captures the /agent/pprof for each listed node
func (c *OperatorDebugCommand) collectPprofs(client *api.Client, serverIDs, nodeIDs []string, interval int) {
	for _, n := range nodeIDs {
		c.collectPprof(clientDir, n, client, interval)
	}

	for _, n := range serverIDs {
		c.collectPprof(serverDir, n, client, interval)
	}
}

// collectPprof captures pprof data for the node: the CPU profile plus the
// goroutine (in three formats), trace, heap, allocs, and threadcreate
// profiles. A 403 on the CPU profile aborts early since all profiles share
// the same ACL requirement.
func (c *OperatorDebugCommand) collectPprof(path, id string, client *api.Client, interval int) {
	pprofDurationSeconds := int(c.pprofDuration.Seconds())
	opts := api.PprofOptions{Seconds: pprofDurationSeconds}
	if path == serverDir {
		opts.ServerID = id
	} else {
		opts.NodeID = id
	}

	path = filepath.Join(path, id)
	filename := fmt.Sprintf("profile_%04d.prof", interval)

	bs, err := client.Agent().CPUProfile(opts, c.queryOpts())
	if err != nil {
		c.Ui.Error(fmt.Sprintf("%s: Failed to retrieve pprof %s, err: %v", filename, path, err))
		if strings.Contains(err.Error(), api.PermissionDeniedErrorContent) {
			// All Profiles require the same permissions, so we only need to see
			// one permission failure before we bail.
			// But lets first drop a hint to help the operator resolve the error

			c.Ui.Warn("Pprof retrieval requires agent:write ACL or enable_debug=true. See https://www.nomadproject.io/api-docs/agent#agent-runtime-profiles for more information.")
			return // only exit on 403
		}
	} else {
		err := c.writeBytes(path, filename, bs)
		if err != nil {
			c.Ui.Error(err.Error())
		}
	}

	// goroutine debug type 1 = legacy text format for human readable output
	opts.Debug = 1
	c.savePprofProfile(path, "goroutine", opts, client)

	// goroutine debug type 2 = goroutine stacks in panic format
	opts.Debug = 2
	c.savePprofProfile(path, "goroutine", opts, client)

	// Reset to pprof binary format
	opts.Debug = 0

	c.savePprofProfile(path, "goroutine", opts, client)    // Stack traces of all current goroutines
	c.savePprofProfile(path, "trace", opts, client)        // A trace of execution of the current program
	c.savePprofProfile(path, "heap", opts, client)         // A sampling of memory allocations of live objects. You can specify the gc GET parameter to run GC before taking the heap sample.
	c.savePprofProfile(path, "allocs", opts, client)       // A sampling of all past memory allocations
	c.savePprofProfile(path, "threadcreate", opts, client) // Stack traces that led to the creation of new OS threads
}

// savePprofProfile retrieves a pprof profile and writes to disk
func (c *OperatorDebugCommand) savePprofProfile(path string, profile string, opts api.PprofOptions, client *api.Client) {
	fileName := fmt.Sprintf("%s.prof", profile)
	if opts.Debug > 0 {
		fileName = fmt.Sprintf("%s-debug%d.txt", profile, opts.Debug)
	}

	bs, err := retrievePprofProfile(profile, opts, client, c.queryOpts())
	if err != nil {
		c.Ui.Error(fmt.Sprintf("%s: Failed to retrieve pprof %s, err: %s", path, fileName, err.Error()))
		// BUG FIX: previously we fell through and wrote a misleading empty
		// profile file (and reported a second, spurious write error).
		return
	}

	err = c.writeBytes(path, fileName, bs)
	if err != nil {
		c.Ui.Error(fmt.Sprintf("%s: Failed to write file %s, err: %s", path, fileName, err.Error()))
	}
}

// retrievePprofProfile gets a pprof profile from the node specified
// in opts using the API client
func retrievePprofProfile(profile string, opts api.PprofOptions, client *api.Client, qopts *api.QueryOptions) (bs []byte, err error) {
	switch profile {
	case "cpuprofile":
		bs, err = client.Agent().CPUProfile(opts, qopts)
	case "trace":
		bs, err = client.Agent().Trace(opts, qopts)
	default:
		bs, err = client.Agent().Lookup(profile, opts, qopts)
	}

	return bs, err
}

// collectPeriodic runs for duration, capturing the cluster state
// every interval.
It flushes and stops the monitor requests 1082 func (c *OperatorDebugCommand) collectPeriodic(client *api.Client) { 1083 duration := time.After(c.duration) 1084 // Set interval to 0 so that we immediately execute, wait the interval next time 1085 interval := time.After(0 * time.Second) 1086 var intervalCount int 1087 var name, dir string 1088 1089 for { 1090 select { 1091 case <-duration: 1092 c.cancel() 1093 return 1094 1095 case <-interval: 1096 name = fmt.Sprintf("%04d", intervalCount) 1097 dir = filepath.Join(intervalDir, name) 1098 c.Ui.Output(fmt.Sprintf(" Capture interval %s", name)) 1099 c.collectNomad(dir, client) 1100 c.collectOperator(dir, client) 1101 interval = time.After(c.interval) 1102 intervalCount++ 1103 1104 case <-c.ctx.Done(): 1105 return 1106 } 1107 } 1108 } 1109 1110 // collectOperator captures some cluster meta information 1111 func (c *OperatorDebugCommand) collectOperator(dir string, client *api.Client) { 1112 rc, err := client.Operator().RaftGetConfiguration(c.queryOpts()) 1113 c.reportErr(writeResponseOrErrorToFile(rc, err, c.newFile(dir, "operator-raft.json"))) 1114 1115 sc, _, err := client.Operator().SchedulerGetConfiguration(c.queryOpts()) 1116 c.reportErr(writeResponseOrErrorToFile(sc, err, c.newFile(dir, "operator-scheduler.json"))) 1117 1118 ah, _, err := client.Operator().AutopilotServerHealth(c.queryOpts()) 1119 c.reportErr(writeResponseOrErrorToFile( 1120 ah, err, c.newFile(dir, "operator-autopilot-health.json"))) 1121 1122 lic, _, err := client.Operator().LicenseGet(c.queryOpts()) 1123 c.reportErr(writeResponseOrErrorToFile(lic, err, c.newFile(dir, "license.json"))) 1124 } 1125 1126 // collectNomad captures the nomad cluster state 1127 func (c *OperatorDebugCommand) collectNomad(dir string, client *api.Client) error { 1128 1129 js, _, err := client.Jobs().List(c.queryOpts()) 1130 c.reportErr(writeResponseStreamOrErrorToFile(js, err, c.newFile(dir, "jobs.json"))) 1131 1132 ds, _, err := client.Deployments().List(c.queryOpts()) 
1133 c.reportErr(writeResponseStreamOrErrorToFile(ds, err, c.newFile(dir, "deployments.json"))) 1134 1135 es, _, err := client.Evaluations().List(c.queryOpts()) 1136 c.reportErr(writeResponseStreamOrErrorToFile(es, err, c.newFile(dir, "evaluations.json"))) 1137 1138 as, _, err := client.Allocations().List(c.queryOpts()) 1139 c.reportErr(writeResponseStreamOrErrorToFile(as, err, c.newFile(dir, "allocations.json"))) 1140 1141 ns, _, err := client.Nodes().List(c.queryOpts()) 1142 c.reportErr(writeResponseStreamOrErrorToFile(ns, err, c.newFile(dir, "nodes.json"))) 1143 1144 // CSI Plugins - /v1/plugins?type=csi 1145 ps, _, err := client.CSIPlugins().List(c.queryOpts()) 1146 c.reportErr(writeResponseStreamOrErrorToFile(ps, err, c.newFile(dir, "csi-plugins.json"))) 1147 1148 // CSI Plugin details - /v1/plugin/csi/:plugin_id 1149 for _, p := range ps { 1150 csiPlugin, _, err := client.CSIPlugins().Info(p.ID, c.queryOpts()) 1151 csiPluginFileName := fmt.Sprintf("csi-plugin-id-%s.json", p.ID) 1152 c.reportErr(writeResponseOrErrorToFile(csiPlugin, err, c.newFile(dir, csiPluginFileName))) 1153 } 1154 1155 // CSI Volumes - /v1/volumes?type=csi 1156 csiVolumes, _, err := client.CSIVolumes().List(c.queryOpts()) 1157 c.reportErr(writeResponseStreamOrErrorToFile( 1158 csiVolumes, err, c.newFile(dir, "csi-volumes.json"))) 1159 1160 // CSI Volume details - /v1/volumes/csi/:volume-id 1161 for _, v := range csiVolumes { 1162 csiVolume, _, err := client.CSIVolumes().Info(v.ID, c.queryOpts()) 1163 csiFileName := fmt.Sprintf("csi-volume-id-%s.json", v.ID) 1164 c.reportErr(writeResponseOrErrorToFile(csiVolume, err, c.newFile(dir, csiFileName))) 1165 } 1166 1167 metrics, _, err := client.Operator().MetricsSummary(c.queryOpts()) 1168 c.reportErr(writeResponseOrErrorToFile(metrics, err, c.newFile(dir, "metrics.json"))) 1169 1170 return nil 1171 } 1172 1173 // collectConsul calls the Consul API to collect data 1174 func (c *OperatorDebugCommand) collectConsul(dir string) { 1175 if 
c.consul.addrVal == "" { 1176 c.Ui.Output("Consul - Skipping, no API address found") 1177 return 1178 } 1179 1180 c.Ui.Info(fmt.Sprintf("Consul - Collecting Consul API data from: %s", c.consul.addrVal)) 1181 1182 client, err := c.consulAPIClient() 1183 if err != nil { 1184 c.Ui.Error(fmt.Sprintf("failed to create Consul API client: %s", err)) 1185 return 1186 } 1187 1188 // Exit if we are unable to retrieve the leader 1189 err = c.collectConsulAPIRequest(client, "/v1/status/leader", dir, "consul-leader.json") 1190 if err != nil { 1191 c.Ui.Output(fmt.Sprintf("Unable to contact Consul leader, skipping: %s", err)) 1192 return 1193 } 1194 1195 c.collectConsulAPI(client, "/v1/agent/host", dir, "consul-agent-host.json") 1196 c.collectConsulAPI(client, "/v1/agent/members", dir, "consul-agent-members.json") 1197 c.collectConsulAPI(client, "/v1/agent/metrics", dir, "consul-agent-metrics.json") 1198 c.collectConsulAPI(client, "/v1/agent/self", dir, "consul-agent-self.json") 1199 } 1200 1201 func (c *OperatorDebugCommand) consulAPIClient() (*http.Client, error) { 1202 httpClient := defaultHttpClient() 1203 1204 err := api.ConfigureTLS(httpClient, c.consul.tls) 1205 if err != nil { 1206 return nil, fmt.Errorf("failed to configure TLS: %w", err) 1207 } 1208 1209 return httpClient, nil 1210 } 1211 1212 func (c *OperatorDebugCommand) collectConsulAPI(client *http.Client, urlPath string, dir string, file string) { 1213 err := c.collectConsulAPIRequest(client, urlPath, dir, file) 1214 if err != nil { 1215 c.Ui.Error(fmt.Sprintf("Error collecting from Consul API: %s", err.Error())) 1216 } 1217 } 1218 1219 func (c *OperatorDebugCommand) collectConsulAPIRequest(client *http.Client, urlPath string, dir string, file string) error { 1220 url := c.consul.addrVal + urlPath 1221 1222 req, err := http.NewRequest("GET", url, nil) 1223 if err != nil { 1224 return fmt.Errorf("failed to create HTTP request for Consul API URL=%q: %w", url, err) 1225 } 1226 1227 req.Header.Add("X-Consul-Token", 
c.consul.token()) 1228 req.Header.Add("User-Agent", userAgent) 1229 1230 resp, err := client.Do(req) 1231 if err != nil { 1232 return err 1233 } 1234 1235 c.writeBody(dir, file, resp, err) 1236 1237 return nil 1238 } 1239 1240 // collectVault calls the Vault API directly to collect data 1241 func (c *OperatorDebugCommand) collectVault(dir, vault string) error { 1242 vaultAddr := c.vault.addr(vault) 1243 if vaultAddr == "" { 1244 return nil 1245 } 1246 1247 c.Ui.Info(fmt.Sprintf("Vault - Collecting Vault API data from: %s", vaultAddr)) 1248 client := defaultHttpClient() 1249 if c.vault.ssl { 1250 err := api.ConfigureTLS(client, c.vault.tls) 1251 if err != nil { 1252 return fmt.Errorf("failed to configure TLS: %w", err) 1253 } 1254 } 1255 1256 req, err := http.NewRequest("GET", vaultAddr+"/v1/sys/health", nil) 1257 if err != nil { 1258 return fmt.Errorf("failed to create HTTP request for Vault API URL=%q: %w", vaultAddr, err) 1259 } 1260 1261 req.Header.Add("X-Vault-Token", c.vault.token()) 1262 req.Header.Add("User-Agent", userAgent) 1263 resp, err := client.Do(req) 1264 c.writeBody(dir, "vault-sys-health.json", resp, err) 1265 1266 return nil 1267 } 1268 1269 // writeBytes writes a file to the archive, recording it in the manifest 1270 func (c *OperatorDebugCommand) writeBytes(dir, file string, data []byte) error { 1271 // Replace invalid characters in filename 1272 filename := helper.CleanFilename(file, "_") 1273 1274 relativePath := filepath.Join(dir, filename) 1275 c.manifest = append(c.manifest, relativePath) 1276 dirPath := filepath.Join(c.collectDir, dir) 1277 filePath := filepath.Join(dirPath, filename) 1278 1279 // Ensure parent directories exist 1280 err := escapingfs.EnsurePath(dirPath, true) 1281 if err != nil { 1282 return fmt.Errorf("failed to create parent directories of %q: %w", dirPath, err) 1283 } 1284 1285 // Ensure filename doesn't escape the sandbox of the capture directory 1286 escapes := escapingfs.PathEscapesSandbox(c.collectDir, filePath) 
1287 if escapes { 1288 return fmt.Errorf("file path %q escapes capture directory %q", filePath, c.collectDir) 1289 } 1290 1291 // Create the file 1292 fh, err := os.Create(filePath) 1293 if err != nil { 1294 return fmt.Errorf("failed to create file %q, err: %w", filePath, err) 1295 } 1296 defer fh.Close() 1297 1298 _, err = fh.Write(data) 1299 if err != nil { 1300 return fmt.Errorf("Failed to write data to file %q, err: %w", filePath, err) 1301 } 1302 return nil 1303 } 1304 1305 // newFilePath returns a validated filepath rooted in the provided directory and 1306 // path. It has been checked that it falls inside the sandbox and has been added 1307 // to the manifest tracking. 1308 func (c *OperatorDebugCommand) newFilePath(dir, file string) (string, error) { 1309 1310 // Replace invalid characters in filename 1311 filename := helper.CleanFilename(file, "_") 1312 1313 relativePath := filepath.Join(dir, filename) 1314 c.manifest = append(c.manifest, relativePath) 1315 dirPath := filepath.Join(c.collectDir, dir) 1316 filePath := filepath.Join(dirPath, filename) 1317 1318 // Ensure parent directories exist 1319 err := escapingfs.EnsurePath(dirPath, true) 1320 if err != nil { 1321 return "", fmt.Errorf("failed to create parent directories of %q: %w", dirPath, err) 1322 } 1323 1324 // Ensure filename doesn't escape the sandbox of the capture directory 1325 escapes := escapingfs.PathEscapesSandbox(c.collectDir, filePath) 1326 if escapes { 1327 return "", fmt.Errorf("file path %q escapes capture directory %q", filePath, c.collectDir) 1328 } 1329 1330 return filePath, nil 1331 } 1332 1333 type writerGetter func() (io.WriteCloser, error) 1334 1335 // newFile returns a func that creates a new file for writing and returns it as 1336 // an io.WriterCloser interface. The caller is responsible for closing the 1337 // io.Writer when its done. 
1338 // 1339 // Note: methods cannot be generic in go, so this function returns a function 1340 // that closes over our command so that we can still reference the command 1341 // object's fields to validate the file. In future iterations it might be nice 1342 // if we could move most of the command into standalone functions. 1343 func (c *OperatorDebugCommand) newFile(dir, file string) writerGetter { 1344 return func() (io.WriteCloser, error) { 1345 filePath, err := c.newFilePath(dir, file) 1346 if err != nil { 1347 return nil, err 1348 } 1349 1350 writer, err := os.Create(filePath) 1351 if err != nil { 1352 return nil, fmt.Errorf("failed to create file %q: %w", filePath, err) 1353 } 1354 return writer, nil 1355 } 1356 } 1357 1358 // writeResponseToFile writes a response object to a file. It returns an error 1359 // that the caller should report to the UI. 1360 func writeResponseToFile(obj any, getWriterFn writerGetter) error { 1361 1362 writer, err := getWriterFn() 1363 if err != nil { 1364 return err 1365 } 1366 defer writer.Close() 1367 1368 err = writeJSON(obj, writer) 1369 if err != nil { 1370 return err 1371 } 1372 return nil 1373 } 1374 1375 // writeResponseOrErrorToFile writes a response object to a file, or the error 1376 // for that response if one was received. It returns an error that the caller 1377 // should report to the UI. 1378 func writeResponseOrErrorToFile(obj any, apiErr error, getWriterFn writerGetter) error { 1379 1380 writer, err := getWriterFn() 1381 if err != nil { 1382 return err 1383 } 1384 defer writer.Close() 1385 1386 if apiErr != nil { 1387 obj = errorWrapper{Error: apiErr.Error()} 1388 } 1389 1390 err = writeJSON(obj, writer) 1391 if err != nil { 1392 return err 1393 } 1394 return nil 1395 } 1396 1397 // writeResponseStreamOrErrorToFile writes a stream of response objects to a 1398 // file in newline-delimited JSON format, or the error for that response if one 1399 // was received. 
It returns an error that the caller should report to the UI. 1400 func writeResponseStreamOrErrorToFile[T any](obj []T, apiErr error, getWriterFn writerGetter) error { 1401 1402 writer, err := getWriterFn() 1403 if err != nil { 1404 return err 1405 } 1406 defer writer.Close() 1407 1408 if apiErr != nil { 1409 wrapped := errorWrapper{Error: apiErr.Error()} 1410 return writeJSON(wrapped, writer) 1411 } 1412 1413 err = writeNDJSON(obj, writer) 1414 if err != nil { 1415 return err 1416 } 1417 return nil 1418 } 1419 1420 // writeNDJSON writes a single Nomad API objects (or response error) to the 1421 // archive file as a JSON object. 1422 func writeJSON(obj any, writer io.Writer) error { 1423 buf, err := json.Marshal(obj) 1424 if err != nil { 1425 buf, err = json.Marshal(errorWrapper{Error: err.Error()}) 1426 if err != nil { 1427 return fmt.Errorf("could not serialize our own error: %v", err) 1428 } 1429 } 1430 n, err := writer.Write(buf) 1431 if err != nil { 1432 return fmt.Errorf("write error, wrote %d bytes of %d: %v", n, len(buf), err) 1433 } 1434 return nil 1435 } 1436 1437 // writeNDJSON writes a slice of Nomad API objects to the archive file as 1438 // newline-delimited JSON objects. 
1439 func writeNDJSON[T any](data []T, writer io.Writer) error { 1440 for _, obj := range data { 1441 err := writeJSON(obj, writer) 1442 if err != nil { 1443 return fmt.Errorf("failed to write to file: %w", err) 1444 } 1445 _, err = writer.Write([]byte{'\n'}) 1446 if err != nil { 1447 return fmt.Errorf("failed to write to file: %w", err) 1448 } 1449 } 1450 1451 return nil 1452 } 1453 1454 // writeError writes a JSON error object to capture errors in the debug bundle without 1455 // reporting 1456 func (c *OperatorDebugCommand) writeError(dir, file string, err error) error { 1457 bytes, err := json.Marshal(errorWrapper{Error: err.Error()}) 1458 if err != nil { 1459 return err 1460 } 1461 return c.writeBytes(dir, file, bytes) 1462 } 1463 1464 type errorWrapper struct { 1465 Error string 1466 } 1467 1468 // writeBody is a helper that writes the body of an http.Response to the archive 1469 func (c *OperatorDebugCommand) writeBody(dir, file string, resp *http.Response, err error) { 1470 if err != nil { 1471 c.writeError(dir, file, err) 1472 return 1473 } 1474 1475 if resp.ContentLength == 0 { 1476 return 1477 } 1478 1479 defer resp.Body.Close() 1480 1481 body, err := io.ReadAll(resp.Body) 1482 if err != nil { 1483 c.writeError(dir, file, err) 1484 return 1485 } 1486 1487 if err := c.writeBytes(dir, file, body); err != nil { 1488 c.Ui.Error(err.Error()) 1489 } 1490 } 1491 1492 type flagExport struct { 1493 Name string 1494 Parsed bool 1495 Actual map[string]*flag.Flag 1496 Formal map[string]*flag.Flag 1497 Effective map[string]*flag.Flag // All flags with non-empty value 1498 Args []string // arguments after flags 1499 OsArgs []string 1500 } 1501 1502 // writeFlags exports the CLI flags to JSON file 1503 func (c *OperatorDebugCommand) writeFlags(flags *flag.FlagSet) { 1504 1505 var f flagExport 1506 f.Name = flags.Name() 1507 f.Parsed = flags.Parsed() 1508 f.Formal = make(map[string]*flag.Flag) 1509 f.Actual = make(map[string]*flag.Flag) 1510 f.Effective = 
make(map[string]*flag.Flag) 1511 f.Args = flags.Args() 1512 f.OsArgs = os.Args 1513 1514 // Formal flags (all flags) 1515 flags.VisitAll(func(flagA *flag.Flag) { 1516 f.Formal[flagA.Name] = flagA 1517 1518 // Determine which of thees are "effective" flags by comparing to empty string 1519 if flagA.Value.String() != "" { 1520 f.Effective[flagA.Name] = flagA 1521 } 1522 }) 1523 // Actual flags (everything passed on cmdline) 1524 flags.Visit(func(flag *flag.Flag) { 1525 f.Actual[flag.Name] = flag 1526 }) 1527 1528 c.reportErr(writeResponseToFile(f, c.newFile(clusterDir, "cli-flags.json"))) 1529 } 1530 1531 func (c *OperatorDebugCommand) reportErr(err error) { 1532 if err != nil { 1533 c.Ui.Error(err.Error()) 1534 } 1535 } 1536 1537 // writeManifest creates the index files 1538 func (c *OperatorDebugCommand) writeManifest() error { 1539 // Write the JSON 1540 path := filepath.Join(c.collectDir, "index.json") 1541 jsonFh, err := os.Create(path) 1542 if err != nil { 1543 return err 1544 } 1545 defer jsonFh.Close() 1546 1547 json.NewEncoder(jsonFh).Encode(c.manifest) 1548 1549 // Write the HTML 1550 path = filepath.Join(c.collectDir, "index.html") 1551 htmlFh, err := os.Create(path) 1552 if err != nil { 1553 return err 1554 } 1555 defer htmlFh.Close() 1556 1557 head, _ := template.New("head").Parse("<html><head><title>{{.}}</title></head>\n<body><h1>{{.}}</h1>\n<ul>") 1558 line, _ := template.New("line").Parse("<li><a href=\"{{.}}\">{{.}}</a></li>\n") 1559 if err != nil { 1560 return fmt.Errorf("%v", err) 1561 } 1562 tail := "</ul></body></html>\n" 1563 1564 head.Execute(htmlFh, c.timestamp) 1565 for _, f := range c.manifest { 1566 line.Execute(htmlFh, f) 1567 } 1568 htmlFh.WriteString(tail) 1569 1570 return nil 1571 } 1572 1573 // trap captures signals, and closes stopCh 1574 func (c *OperatorDebugCommand) trap() { 1575 sigCh := make(chan os.Signal, 1) 1576 signal.Notify(sigCh, 1577 syscall.SIGHUP, 1578 syscall.SIGINT, 1579 syscall.SIGTERM, 1580 syscall.SIGQUIT) 1581 
1582 go func() { 1583 <-sigCh 1584 c.cancel() 1585 }() 1586 } 1587 1588 func (c *OperatorDebugCommand) verboseOut(out string) { 1589 if c.verbose { 1590 c.Ui.Output(out) 1591 } 1592 } 1593 1594 func (c *OperatorDebugCommand) verboseOutf(format string, a ...interface{}) { 1595 c.verboseOut(fmt.Sprintf(format, a...)) 1596 } 1597 1598 // TarCZF like the tar command, recursively builds a gzip compressed tar 1599 // archive from a directory. If not empty, all files in the bundle are prefixed 1600 // with the target path. 1601 func TarCZF(archive string, src, target string) error { 1602 // ensure the src actually exists before trying to tar it 1603 if _, err := os.Stat(src); err != nil { 1604 return fmt.Errorf("Unable to tar files - %v", err.Error()) 1605 } 1606 1607 // create the archive 1608 fh, err := os.Create(archive) 1609 if err != nil { 1610 return err 1611 } 1612 defer fh.Close() 1613 1614 zz := gzip.NewWriter(fh) 1615 defer zz.Close() 1616 1617 tw := tar.NewWriter(zz) 1618 defer tw.Close() 1619 1620 // tar 1621 return filepath.Walk(src, func(file string, fi os.FileInfo, err error) error { 1622 1623 // return on any error 1624 if err != nil { 1625 return err 1626 } 1627 1628 if !fi.Mode().IsRegular() { 1629 return nil 1630 } 1631 1632 header, err := tar.FileInfoHeader(fi, fi.Name()) 1633 if err != nil { 1634 return err 1635 } 1636 1637 // remove leading path to the src, so files are relative to the archive 1638 path := strings.ReplaceAll(file, src, "") 1639 if target != "" { 1640 path = filepath.Join([]string{target, path}...) 
1641 } 1642 path = strings.TrimPrefix(path, string(filepath.Separator)) 1643 1644 header.Name = path 1645 1646 if err := tw.WriteHeader(header); err != nil { 1647 return err 1648 } 1649 1650 // copy the file contents 1651 f, err := os.Open(file) 1652 if err != nil { 1653 return err 1654 } 1655 1656 if _, err := io.Copy(tw, f); err != nil { 1657 return err 1658 } 1659 1660 f.Close() 1661 1662 return nil 1663 }) 1664 } 1665 1666 // filterServerMembers returns a slice of server member names matching the search criteria 1667 func filterServerMembers(serverMembers *api.ServerMembers, serverIDs string, region string) (membersFound []string, err error) { 1668 if serverMembers.Members == nil { 1669 return nil, fmt.Errorf("Failed to parse server members, members==nil") 1670 } 1671 1672 prefixes := stringToSlice(serverIDs) 1673 1674 // "leader" is a special case which Nomad handles in the API. If "leader" 1675 // appears in serverIDs, add it to membersFound and remove it from the list 1676 // so that it isn't processed by the range loop 1677 if slices.Contains(prefixes, "leader") { 1678 membersFound = append(membersFound, "leader") 1679 helper.RemoveEqualFold(&prefixes, "leader") 1680 } 1681 1682 for _, member := range serverMembers.Members { 1683 // If region is provided it must match exactly 1684 if region != "" && member.Tags["region"] != region { 1685 continue 1686 } 1687 1688 // Always include "all" 1689 if serverIDs == "all" { 1690 membersFound = append(membersFound, member.Name) 1691 continue 1692 } 1693 1694 // Include member if name matches any prefix from serverIDs 1695 if helper.StringHasPrefixInSlice(member.Name, prefixes) { 1696 membersFound = append(membersFound, member.Name) 1697 } 1698 } 1699 1700 return membersFound, nil 1701 } 1702 1703 // stringToSlice splits comma-separated input string into slice, trims 1704 // whitespace, and prunes empty values 1705 func stringToSlice(input string) []string { 1706 ns := strings.Split(input, ",") 1707 var out []string 
1708 for _, n := range ns { 1709 s := strings.TrimSpace(n) 1710 if s == "" { 1711 continue 1712 } 1713 out = append(out, s) 1714 } 1715 return out 1716 } 1717 1718 func parseEventTopics(topicList []string) (map[api.Topic][]string, error) { 1719 topics := make(map[api.Topic][]string) 1720 1721 var mErrs *multierror.Error 1722 1723 for _, topic := range topicList { 1724 k, v, err := parseTopic(topic) 1725 if err != nil { 1726 mErrs = multierror.Append(mErrs, err) 1727 } 1728 1729 topics[api.Topic(k)] = append(topics[api.Topic(k)], v) 1730 } 1731 1732 return topics, mErrs.ErrorOrNil() 1733 } 1734 1735 func parseTopic(input string) (string, string, error) { 1736 var topic, filter string 1737 1738 parts := strings.Split(input, ":") 1739 switch len(parts) { 1740 case 1: 1741 // infer wildcard if only given a topic 1742 topic = input 1743 filter = "*" 1744 case 2: 1745 topic = parts[0] 1746 filter = parts[1] 1747 default: 1748 return "", "", fmt.Errorf("Invalid key value pair for topic: %s", topic) 1749 } 1750 1751 return strings.Title(topic), filter, nil 1752 } 1753 1754 func allTopics() map[api.Topic][]string { 1755 return map[api.Topic][]string{"*": {"*"}} 1756 } 1757 1758 // topicsFromString parses a comma separated list into a topicMap 1759 func topicsFromString(topicList string) (map[api.Topic][]string, error) { 1760 if topicList == "none" { 1761 return nil, nil 1762 } 1763 if topicList == "all" { 1764 return allTopics(), nil 1765 } 1766 1767 topics := stringToSlice(topicList) 1768 topicMap, err := parseEventTopics(topics) 1769 if err != nil { 1770 return nil, err 1771 } 1772 return topicMap, nil 1773 } 1774 1775 // external holds address configuration for Consul and Vault APIs 1776 type external struct { 1777 tls *api.TLSConfig 1778 addrVal string 1779 auth string 1780 ssl bool 1781 tokenVal string 1782 tokenFile string 1783 } 1784 1785 func (e *external) addr(defaultAddr string) string { 1786 if e.addrVal == "" { 1787 return defaultAddr 1788 } 1789 1790 // Return 
address as-is if it contains a protocol 1791 if strings.Contains(e.addrVal, "://") { 1792 return e.addrVal 1793 } 1794 1795 if e.ssl { 1796 return "https://" + e.addrVal 1797 } 1798 1799 return "http://" + e.addrVal 1800 } 1801 1802 func (e *external) setAddr(addr string) { 1803 // Handle no protocol scenario first 1804 if !strings.Contains(addr, "://") { 1805 e.addrVal = "http://" + addr 1806 if e.ssl { 1807 e.addrVal = "https://" + addr 1808 } 1809 return 1810 } 1811 1812 // Set SSL boolean based on protocol 1813 e.ssl = false 1814 if strings.Contains(addr, "https") { 1815 e.ssl = true 1816 } 1817 e.addrVal = addr 1818 } 1819 1820 func (e *external) token() string { 1821 if e.tokenVal != "" { 1822 return e.tokenVal 1823 } 1824 1825 if e.tokenFile != "" { 1826 bs, err := os.ReadFile(e.tokenFile) 1827 if err == nil { 1828 return strings.TrimSpace(string(bs)) 1829 } 1830 } 1831 1832 return "" 1833 } 1834 1835 func (c *OperatorDebugCommand) getConsulAddrFromSelf(self *api.AgentSelf) string { 1836 if self == nil { 1837 return "" 1838 } 1839 1840 var consulAddr string 1841 r, ok := self.Config["Consul"] 1842 if ok { 1843 m, ok := r.(map[string]interface{}) 1844 if ok { 1845 raw := m["EnableSSL"] 1846 c.consul.ssl, _ = raw.(bool) 1847 raw = m["Addr"] 1848 c.consul.setAddr(raw.(string)) 1849 raw = m["Auth"] 1850 c.consul.auth, _ = raw.(string) 1851 raw = m["Token"] 1852 c.consul.tokenVal = raw.(string) 1853 1854 consulAddr = c.consul.addr("") 1855 } 1856 } 1857 return consulAddr 1858 } 1859 1860 func (c *OperatorDebugCommand) getVaultAddrFromSelf(self *api.AgentSelf) string { 1861 if self == nil { 1862 return "" 1863 } 1864 1865 var vaultAddr string 1866 r, ok := self.Config["Vault"] 1867 if ok { 1868 m, ok := r.(map[string]interface{}) 1869 if ok { 1870 raw := m["EnableSSL"] 1871 c.vault.ssl, _ = raw.(bool) 1872 raw = m["Addr"] 1873 c.vault.setAddr(raw.(string)) 1874 raw = m["Auth"] 1875 c.vault.auth, _ = raw.(string) 1876 raw = m["Token"] 1877 c.vault.tokenVal = 
raw.(string) 1878 1879 vaultAddr = c.vault.addr("") 1880 } 1881 } 1882 return vaultAddr 1883 } 1884 1885 // defaultHttpClient configures a basic httpClient 1886 func defaultHttpClient() *http.Client { 1887 httpClient := cleanhttp.DefaultClient() 1888 transport := httpClient.Transport.(*http.Transport) 1889 transport.TLSHandshakeTimeout = 10 * time.Second 1890 transport.TLSClientConfig = &tls.Config{ 1891 MinVersion: tls.VersionTLS12, 1892 } 1893 1894 return httpClient 1895 } 1896 1897 // isRedirectError returns true if an error is a redirect error. 1898 func isRedirectError(err error) bool { 1899 if err == nil { 1900 return false 1901 } 1902 1903 const redirectErr string = `invalid character '<' looking for beginning of value` 1904 return strings.Contains(err.Error(), redirectErr) 1905 } 1906 1907 // getNomadVersion fetches the version of Nomad running on a given server/client node ID 1908 func (c *OperatorDebugCommand) getNomadVersion(serverID string, nodeID string) string { 1909 if serverID == "" && nodeID == "" { 1910 return "" 1911 } 1912 1913 version := "" 1914 if serverID != "" { 1915 for _, server := range c.members.Members { 1916 // Raft v2 server 1917 if server.Name == serverID { 1918 version = server.Tags["build"] 1919 } 1920 1921 // Raft v3 server 1922 if server.Tags["id"] == serverID { 1923 version = server.Tags["version"] 1924 } 1925 } 1926 } 1927 1928 if nodeID != "" { 1929 for _, node := range c.nodes { 1930 if node.ID == nodeID { 1931 version = node.Version 1932 } 1933 } 1934 } 1935 1936 return version 1937 } 1938 1939 // checkVersion verifies that version satisfies the constraint 1940 func checkVersion(version string, versionConstraint string) error { 1941 v, err := goversion.NewVersion(version) 1942 if err != nil { 1943 return fmt.Errorf("error: %v", err) 1944 } 1945 1946 c, err := goversion.NewConstraint(versionConstraint) 1947 if err != nil { 1948 return fmt.Errorf("error: %v", err) 1949 } 1950 1951 if !c.Check(v) { 1952 return nil 1953 } 1954 
return fmt.Errorf("unsupported version=%s matches version filter %s", version, minimumVersionPprofConstraint) 1955 }