github.com/Ilhicas/nomad@v1.0.4-0.20210304152020-e86851182bc3/command/operator_debug.go (about)

     1  package command
     2  
     3  import (
     4  	"archive/tar"
     5  	"compress/gzip"
     6  	"context"
     7  	"crypto/tls"
     8  	"encoding/json"
     9  	"fmt"
    10  	"html/template"
    11  	"io"
    12  	"io/ioutil"
    13  	"net/http"
    14  	"os"
    15  	"os/signal"
    16  	"path/filepath"
    17  	"strconv"
    18  	"strings"
    19  	"syscall"
    20  	"time"
    21  
    22  	"github.com/hashicorp/go-cleanhttp"
    23  	"github.com/hashicorp/nomad/api"
    24  	"github.com/hashicorp/nomad/helper"
    25  	"github.com/hashicorp/nomad/nomad/structs"
    26  	"github.com/posener/complete"
    27  )
    28  
// OperatorDebugCommand implements the "nomad operator debug" command. It
// holds the parsed flag values and the state shared by the collection steps
// of a single capture run.
type OperatorDebugCommand struct {
	// Meta is the embedded command metadata; this file uses it for the flag
	// set, the API client and the Ui.
	Meta

	// timestamp is the UTC time the run started, formatted for use in the
	// capture directory and archive names.
	timestamp     string
	// collectDir is the root directory every captured file is written under.
	collectDir    string
	// duration is the parsed -duration flag: how long the capture runs.
	duration      time.Duration
	// interval is the parsed -interval flag: time between periodic snapshots.
	interval      time.Duration
	// pprofDuration is the parsed -pprof-duration flag.
	pprofDuration time.Duration
	// logLevel is the -log-level flag passed to the agent monitor requests.
	logLevel      string
	// stale is the value of the -stale flag.
	stale         bool
	// maxNodes is the -max-nodes flag: cap on captured client nodes.
	maxNodes      int
	// nodeClass is the -node-class flag used to filter client nodes.
	nodeClass     string
	// nodeIDs are the resolved IDs of the client nodes to capture from.
	nodeIDs       []string
	// serverIDs are the names of the servers to capture from.
	serverIDs     []string
	// consul holds the address, token and TLS settings for direct Consul queries.
	consul        *external
	// vault holds the address, token and TLS settings for direct Vault queries.
	vault         *external
	// manifest lists the capture-relative paths of the files written so far.
	manifest      []string
	// ctx is cancelled to stop the monitor goroutines and the periodic capture.
	ctx           context.Context
	// cancel cancels ctx.
	cancel        context.CancelFunc
}
    49  
    50  const (
    51  	userAgent = "nomad operator debug"
    52  )
    53  
    54  func (c *OperatorDebugCommand) Help() string {
    55  	helpText := `
    56  Usage: nomad operator debug [options]
    57  
    58    Build an archive containing Nomad cluster configuration and state, and Consul and Vault
    59    status. Include logs and pprof profiles for selected servers and client nodes.
    60  
    61    If ACLs are enabled, this command will require a token with the 'node:read'
    62    capability to run. In order to collect information, the token will also
    63    require the 'agent:read' and 'operator:read' capabilities, as well as the
    64    'list-jobs' capability for all namespaces. To collect pprof profiles the
    65    token will also require 'agent:write', or enable_debug configuration set to true.
    66  
    67  General Options:
    68  
    69    ` + generalOptionsUsage(usageOptsDefault|usageOptsNoNamespace) + `
    70  
    71  Debug Options:
    72  
    73    -duration=<duration>
    74      The duration of the log monitor command. Defaults to 2m.
    75  
    76    -interval=<interval>
    77      The interval between snapshots of the Nomad state. If unspecified, only one snapshot is
    78      captured.
    79  
    80    -log-level=<level>
    81      The log level to monitor. Defaults to DEBUG.
    82  
    83    -max-nodes=<count>
    84      Cap the maximum number of client nodes included in the capture.  Defaults to 10, set to 0 for unlimited.
    85  
    86    -node-id=<node>,<node>
    87      Comma separated list of Nomad client node ids, to monitor for logs and include pprof
    88      profiles. Accepts id prefixes, and "all" to select all nodes (up to count = max-nodes).
    89  
    90    -node-class=<node-class>
    91      Filter client nodes based on node class.
    92  
    93    -pprof-duration=<duration>
    94      Duration for pprof collection. Defaults to 1s.
    95  
    96    -server-id=<server>,<server>
    97      Comma separated list of Nomad server names, "leader", or "all" to monitor for logs and include pprof
    98      profiles.
    99  
   100    -stale=<true|false>
   101      If "false", the default, get membership data from the cluster leader. If the cluster is in
   102      an outage unable to establish leadership, it may be necessary to get the configuration from
   103      a non-leader server.
   104  
   105    -output=<path>
   106      Path to the parent directory of the output directory. If not specified, an archive is built
   107      in the current directory.
   108  
   109    -consul-http-addr=<addr>
   110      The address and port of the Consul HTTP agent. Overrides the CONSUL_HTTP_ADDR environment variable.
   111  
   112    -consul-token=<token>
   113      Token used to query Consul. Overrides the CONSUL_HTTP_TOKEN environment
   114      variable and the Consul token file.
   115  
   116    -consul-token-file=<path>
   117      Path to the Consul token file. Overrides the CONSUL_HTTP_TOKEN_FILE
   118      environment variable.
   119  
   120    -consul-client-cert=<path>
   121      Path to the Consul client cert file. Overrides the CONSUL_CLIENT_CERT
   122      environment variable.
   123  
   124    -consul-client-key=<path>
   125      Path to the Consul client key file. Overrides the CONSUL_CLIENT_KEY
   126      environment variable.
   127  
   128    -consul-ca-cert=<path>
   129      Path to a CA file to use with Consul. Overrides the CONSUL_CACERT
   130      environment variable and the Consul CA path.
   131  
   132    -consul-ca-path=<path>
   133      Path to a directory of PEM encoded CA cert files to verify the Consul
   134      certificate. Overrides the CONSUL_CAPATH environment variable.
   135  
   136    -vault-address=<addr>
   137      The address and port of the Vault HTTP agent. Overrides the VAULT_ADDR
   138      environment variable.
   139  
   140    -vault-token=<token>
   141      Token used to query Vault. Overrides the VAULT_TOKEN environment
   142      variable.
   143  
   144    -vault-client-cert=<path>
   145      Path to the Vault client cert file. Overrides the VAULT_CLIENT_CERT
   146      environment variable.
   147  
   148    -vault-client-key=<path>
   149      Path to the Vault client key file. Overrides the VAULT_CLIENT_KEY
   150      environment variable.
   151  
   152    -vault-ca-cert=<path>
   153      Path to a CA file to use with Vault. Overrides the VAULT_CACERT
   154      environment variable and the Vault CA path.
   155  
   156    -vault-ca-path=<path>
   157      Path to a directory of PEM encoded CA cert files to verify the Vault
   158      certificate. Overrides the VAULT_CAPATH environment variable.
   159  `
   160  	return strings.TrimSpace(helpText)
   161  }
   162  
   163  func (c *OperatorDebugCommand) Synopsis() string {
   164  	return "Build a debug archive"
   165  }
   166  
   167  func (c *OperatorDebugCommand) AutocompleteFlags() complete.Flags {
   168  	return mergeAutocompleteFlags(c.Meta.AutocompleteFlags(FlagSetClient),
   169  		complete.Flags{
   170  			"-duration":       complete.PredictAnything,
   171  			"-interval":       complete.PredictAnything,
   172  			"-log-level":      complete.PredictAnything,
   173  			"-max-nodes":      complete.PredictAnything,
   174  			"-node-class":     complete.PredictAnything,
   175  			"-node-id":        complete.PredictAnything,
   176  			"-server-id":      complete.PredictAnything,
   177  			"-output":         complete.PredictAnything,
   178  			"-pprof-duration": complete.PredictAnything,
   179  			"-consul-token":   complete.PredictAnything,
   180  			"-vault-token":    complete.PredictAnything,
   181  		})
   182  }
   183  
   184  func (c *OperatorDebugCommand) AutocompleteArgs() complete.Predictor {
   185  	return complete.PredictNothing
   186  }
   187  
   188  func (c *OperatorDebugCommand) Name() string { return "debug" }
   189  
   190  func (c *OperatorDebugCommand) Run(args []string) int {
   191  	flags := c.Meta.FlagSet(c.Name(), FlagSetClient)
   192  	flags.Usage = func() { c.Ui.Output(c.Help()) }
   193  
   194  	var duration, interval, output, pprofDuration string
   195  	var nodeIDs, serverIDs string
   196  
   197  	flags.StringVar(&duration, "duration", "2m", "")
   198  	flags.StringVar(&interval, "interval", "2m", "")
   199  	flags.StringVar(&c.logLevel, "log-level", "DEBUG", "")
   200  	flags.IntVar(&c.maxNodes, "max-nodes", 10, "")
   201  	flags.StringVar(&c.nodeClass, "node-class", "", "")
   202  	flags.StringVar(&nodeIDs, "node-id", "", "")
   203  	flags.StringVar(&serverIDs, "server-id", "", "")
   204  	flags.BoolVar(&c.stale, "stale", false, "")
   205  	flags.StringVar(&output, "output", "", "")
   206  	flags.StringVar(&pprofDuration, "pprof-duration", "1s", "")
   207  
   208  	c.consul = &external{tls: &api.TLSConfig{}}
   209  	flags.StringVar(&c.consul.addrVal, "consul-http-addr", os.Getenv("CONSUL_HTTP_ADDR"), "")
   210  	ssl := os.Getenv("CONSUL_HTTP_SSL")
   211  	c.consul.ssl, _ = strconv.ParseBool(ssl)
   212  	flags.StringVar(&c.consul.auth, "consul-auth", os.Getenv("CONSUL_HTTP_AUTH"), "")
   213  	flags.StringVar(&c.consul.tokenVal, "consul-token", os.Getenv("CONSUL_HTTP_TOKEN"), "")
   214  	flags.StringVar(&c.consul.tokenFile, "consul-token-file", os.Getenv("CONSUL_HTTP_TOKEN_FILE"), "")
   215  	flags.StringVar(&c.consul.tls.ClientCert, "consul-client-cert", os.Getenv("CONSUL_CLIENT_CERT"), "")
   216  	flags.StringVar(&c.consul.tls.ClientKey, "consul-client-key", os.Getenv("CONSUL_CLIENT_KEY"), "")
   217  	flags.StringVar(&c.consul.tls.CACert, "consul-ca-cert", os.Getenv("CONSUL_CACERT"), "")
   218  	flags.StringVar(&c.consul.tls.CAPath, "consul-ca-path", os.Getenv("CONSUL_CAPATH"), "")
   219  
   220  	c.vault = &external{tls: &api.TLSConfig{}}
   221  	flags.StringVar(&c.vault.addrVal, "vault-address", os.Getenv("VAULT_ADDR"), "")
   222  	flags.StringVar(&c.vault.tokenVal, "vault-token", os.Getenv("VAULT_TOKEN"), "")
   223  	flags.StringVar(&c.vault.tls.CACert, "vault-ca-cert", os.Getenv("VAULT_CACERT"), "")
   224  	flags.StringVar(&c.vault.tls.CAPath, "vault-ca-path", os.Getenv("VAULT_CAPATH"), "")
   225  	flags.StringVar(&c.vault.tls.ClientCert, "vault-client-cert", os.Getenv("VAULT_CLIENT_CERT"), "")
   226  	flags.StringVar(&c.vault.tls.ClientKey, "vault-client-key", os.Getenv("VAULT_CLIENT_KEY"), "")
   227  
   228  	if err := flags.Parse(args); err != nil {
   229  		c.Ui.Error(fmt.Sprintf("Error parsing arguments: %q", err))
   230  		return 1
   231  	}
   232  
   233  	// Parse the capture duration
   234  	d, err := time.ParseDuration(duration)
   235  	if err != nil {
   236  		c.Ui.Error(fmt.Sprintf("Error parsing duration: %s: %s", duration, err.Error()))
   237  		return 1
   238  	}
   239  	c.duration = d
   240  
   241  	// Parse the capture interval
   242  	i, err := time.ParseDuration(interval)
   243  	if err != nil {
   244  		c.Ui.Error(fmt.Sprintf("Error parsing interval: %s: %s", interval, err.Error()))
   245  		return 1
   246  	}
   247  	c.interval = i
   248  
   249  	// Parse the pprof capture duration
   250  	pd, err := time.ParseDuration(pprofDuration)
   251  	if err != nil {
   252  		c.Ui.Error(fmt.Sprintf("Error parsing pprof duration: %s: %s", pprofDuration, err.Error()))
   253  		return 1
   254  	}
   255  	c.pprofDuration = pd
   256  
   257  	// Verify there are no extra arguments
   258  	args = flags.Args()
   259  	if l := len(args); l != 0 {
   260  		c.Ui.Error("This command takes no arguments")
   261  		c.Ui.Error(commandErrorText(c))
   262  		return 1
   263  	}
   264  
   265  	// Initialize capture variables and structs
   266  	c.manifest = make([]string, 0)
   267  	ctx, cancel := context.WithCancel(context.Background())
   268  	c.ctx = ctx
   269  	c.cancel = cancel
   270  	c.trap()
   271  
   272  	// Generate timestamped file name
   273  	format := "2006-01-02-150405Z"
   274  	c.timestamp = time.Now().UTC().Format(format)
   275  	stamped := "nomad-debug-" + c.timestamp
   276  
   277  	// Create the output directory
   278  	var tmp string
   279  	if output != "" {
   280  		// User specified output directory
   281  		tmp = filepath.Join(output, stamped)
   282  		_, err := os.Stat(tmp)
   283  		if !os.IsNotExist(err) {
   284  			c.Ui.Error("Output directory already exists")
   285  			return 2
   286  		}
   287  	} else {
   288  		// Generate temp directory
   289  		tmp, err = ioutil.TempDir(os.TempDir(), stamped)
   290  		if err != nil {
   291  			c.Ui.Error(fmt.Sprintf("Error creating tmp directory: %s", err.Error()))
   292  			return 2
   293  		}
   294  		defer os.RemoveAll(tmp)
   295  	}
   296  
   297  	c.collectDir = tmp
   298  
   299  	// Create an instance of the API client
   300  	client, err := c.Meta.Client()
   301  	if err != nil {
   302  		c.Ui.Error(fmt.Sprintf("Error initializing client: %s", err.Error()))
   303  		return 1
   304  	}
   305  
   306  	// Search all nodes If a node class is specified without a list of node id prefixes
   307  	if c.nodeClass != "" && nodeIDs == "" {
   308  		nodeIDs = "all"
   309  	}
   310  
   311  	// Resolve client node id prefixes
   312  	nodesFound := 0
   313  	nodeLookupFailCount := 0
   314  	nodeCaptureCount := 0
   315  
   316  	for _, id := range argNodes(nodeIDs) {
   317  		if id == "all" {
   318  			// Capture from all nodes using empty prefix filter
   319  			id = ""
   320  		} else {
   321  			// Capture from nodes starting with prefix id
   322  			id = sanitizeUUIDPrefix(id)
   323  		}
   324  		nodes, _, err := client.Nodes().PrefixList(id)
   325  		if err != nil {
   326  			c.Ui.Error(fmt.Sprintf("Error querying node info: %s", err))
   327  			return 1
   328  		}
   329  
   330  		// Increment fail count if no nodes are found
   331  		if len(nodes) == 0 {
   332  			c.Ui.Error(fmt.Sprintf("No node(s) with prefix %q found", id))
   333  			nodeLookupFailCount++
   334  			continue
   335  		}
   336  
   337  		nodesFound += len(nodes)
   338  
   339  		// Apply constraints to nodes found
   340  		for _, n := range nodes {
   341  			// Ignore nodes that do not match specified class
   342  			if c.nodeClass != "" && n.NodeClass != c.nodeClass {
   343  				continue
   344  			}
   345  
   346  			// Add node to capture list
   347  			c.nodeIDs = append(c.nodeIDs, n.ID)
   348  			nodeCaptureCount++
   349  
   350  			// Stop looping when we reach the max
   351  			if c.maxNodes != 0 && nodeCaptureCount >= c.maxNodes {
   352  				break
   353  			}
   354  		}
   355  	}
   356  
   357  	// Return error if nodes were specified but none were found
   358  	if len(nodeIDs) > 0 && nodeCaptureCount == 0 {
   359  		c.Ui.Error(fmt.Sprintf("Failed to retrieve clients, 0 nodes found in list: %s", nodeIDs))
   360  		return 1
   361  	}
   362  
   363  	// Resolve servers
   364  	members, err := client.Agent().Members()
   365  	if err != nil {
   366  		c.Ui.Error(fmt.Sprintf("Failed to retrieve server list; err: %v", err))
   367  		return 1
   368  	}
   369  	c.writeJSON("version", "members.json", members, err)
   370  	// We always write the error to the file, but don't range if no members found
   371  	if serverIDs == "all" && members != nil {
   372  		// Special case to capture from all servers
   373  		for _, member := range members.Members {
   374  			c.serverIDs = append(c.serverIDs, member.Name)
   375  		}
   376  	} else {
   377  		c.serverIDs = append(c.serverIDs, argNodes(serverIDs)...)
   378  	}
   379  
   380  	serversFound := 0
   381  	serverCaptureCount := 0
   382  
   383  	if members != nil {
   384  		serversFound = len(members.Members)
   385  	}
   386  	if c.serverIDs != nil {
   387  		serverCaptureCount = len(c.serverIDs)
   388  	}
   389  
   390  	// Return error if servers were specified but not found
   391  	if len(serverIDs) > 0 && serverCaptureCount == 0 {
   392  		c.Ui.Error(fmt.Sprintf("Failed to retrieve servers, 0 members found in list: %s", serverIDs))
   393  		return 1
   394  	}
   395  
   396  	// Display general info about the capture
   397  	c.Ui.Output("Starting debugger...")
   398  	c.Ui.Output("")
   399  	c.Ui.Output(fmt.Sprintf("          Servers: (%d/%d) %v", serverCaptureCount, serversFound, c.serverIDs))
   400  	c.Ui.Output(fmt.Sprintf("          Clients: (%d/%d) %v", nodeCaptureCount, nodesFound, c.nodeIDs))
   401  	if nodeCaptureCount > 0 && nodeCaptureCount == c.maxNodes {
   402  		c.Ui.Output(fmt.Sprintf("                   Max node count reached (%d)", c.maxNodes))
   403  	}
   404  	if nodeLookupFailCount > 0 {
   405  		c.Ui.Output(fmt.Sprintf("Client fail count: %v", nodeLookupFailCount))
   406  	}
   407  	if c.nodeClass != "" {
   408  		c.Ui.Output(fmt.Sprintf("       Node Class: %s", c.nodeClass))
   409  	}
   410  	c.Ui.Output(fmt.Sprintf("         Interval: %s", interval))
   411  	c.Ui.Output(fmt.Sprintf("         Duration: %s", duration))
   412  	if c.pprofDuration.Seconds() != 1 {
   413  		c.Ui.Output(fmt.Sprintf("   pprof Duration: %s", c.pprofDuration))
   414  	}
   415  	c.Ui.Output("")
   416  	c.Ui.Output("Capturing cluster data...")
   417  
   418  	// Start collecting data
   419  	err = c.collect(client)
   420  	if err != nil {
   421  		c.Ui.Error(fmt.Sprintf("Error collecting data: %s", err.Error()))
   422  		return 2
   423  	}
   424  
   425  	// Write index json/html manifest files
   426  	c.writeManifest()
   427  
   428  	// Exit before archive if output directory was specified
   429  	if output != "" {
   430  		c.Ui.Output(fmt.Sprintf("Created debug directory: %s", c.collectDir))
   431  		return 0
   432  	}
   433  
   434  	// Create archive tarball
   435  	archiveFile := stamped + ".tar.gz"
   436  	err = TarCZF(archiveFile, tmp, stamped)
   437  	if err != nil {
   438  		c.Ui.Error(fmt.Sprintf("Error creating archive: %s", err.Error()))
   439  		return 2
   440  	}
   441  
   442  	// Final output with name of tarball
   443  	c.Ui.Output(fmt.Sprintf("Created debug archive: %s", archiveFile))
   444  	return 0
   445  }
   446  
   447  // collect collects data from our endpoints and writes the archive bundle
   448  func (c *OperatorDebugCommand) collect(client *api.Client) error {
   449  	// Version contains cluster meta information
   450  	dir := "version"
   451  
   452  	self, err := client.Agent().Self()
   453  	c.writeJSON(dir, "agent-self.json", self, err)
   454  
   455  	// Fetch data directly from consul and vault. Ignore errors
   456  	var consul, vault string
   457  
   458  	if self != nil {
   459  		r, ok := self.Config["Consul"]
   460  		if ok {
   461  			m, ok := r.(map[string]interface{})
   462  			if ok {
   463  
   464  				raw := m["Addr"]
   465  				consul, _ = raw.(string)
   466  				raw = m["EnableSSL"]
   467  				ssl, _ := raw.(bool)
   468  				if ssl {
   469  					consul = "https://" + consul
   470  				} else {
   471  					consul = "http://" + consul
   472  				}
   473  			}
   474  		}
   475  
   476  		r, ok = self.Config["Vault"]
   477  		if ok {
   478  			m, ok := r.(map[string]interface{})
   479  			if ok {
   480  				raw := m["Addr"]
   481  				vault, _ = raw.(string)
   482  			}
   483  		}
   484  	}
   485  
   486  	c.collectConsul(dir, consul)
   487  	c.collectVault(dir, vault)
   488  	c.collectAgentHosts(client)
   489  	c.collectPprofs(client)
   490  
   491  	c.startMonitors(client)
   492  	c.collectPeriodic(client)
   493  
   494  	return nil
   495  }
   496  
   497  // path returns platform specific paths in the tmp root directory
   498  func (c *OperatorDebugCommand) path(paths ...string) string {
   499  	ps := []string{c.collectDir}
   500  	ps = append(ps, paths...)
   501  	return filepath.Join(ps...)
   502  }
   503  
   504  // mkdir creates directories in the tmp root directory
   505  func (c *OperatorDebugCommand) mkdir(paths ...string) error {
   506  	joinedPath := c.path(paths...)
   507  
   508  	// Ensure path doesn't escape the sandbox of the capture directory
   509  	escapes := helper.PathEscapesSandbox(c.collectDir, joinedPath)
   510  	if escapes {
   511  		return fmt.Errorf("file path escapes capture directory")
   512  	}
   513  
   514  	return os.MkdirAll(joinedPath, 0755)
   515  }
   516  
   517  // startMonitors starts go routines for each node and client
   518  func (c *OperatorDebugCommand) startMonitors(client *api.Client) {
   519  	for _, id := range c.nodeIDs {
   520  		go c.startMonitor("client", "node_id", id, client)
   521  	}
   522  
   523  	for _, id := range c.serverIDs {
   524  		go c.startMonitor("server", "server_id", id, client)
   525  	}
   526  }
   527  
   528  // startMonitor starts one monitor api request, writing to a file. It blocks and should be
   529  // called in a go routine. Errors are ignored, we want to build the archive even if a node
   530  // is unavailable
   531  func (c *OperatorDebugCommand) startMonitor(path, idKey, nodeID string, client *api.Client) {
   532  	c.mkdir(path, nodeID)
   533  	fh, err := os.Create(c.path(path, nodeID, "monitor.log"))
   534  	if err != nil {
   535  		return
   536  	}
   537  	defer fh.Close()
   538  
   539  	qo := api.QueryOptions{
   540  		Params: map[string]string{
   541  			idKey:       nodeID,
   542  			"log_level": c.logLevel,
   543  		},
   544  	}
   545  
   546  	outCh, errCh := client.Agent().Monitor(c.ctx.Done(), &qo)
   547  	for {
   548  		select {
   549  		case out := <-outCh:
   550  			if out == nil {
   551  				continue
   552  			}
   553  			fh.Write(out.Data)
   554  			fh.WriteString("\n")
   555  
   556  		case err := <-errCh:
   557  			fh.WriteString(fmt.Sprintf("monitor: %s\n", err.Error()))
   558  			return
   559  
   560  		case <-c.ctx.Done():
   561  			return
   562  		}
   563  	}
   564  }
   565  
   566  // collectAgentHosts calls collectAgentHost for each selected node
   567  func (c *OperatorDebugCommand) collectAgentHosts(client *api.Client) {
   568  	for _, n := range c.nodeIDs {
   569  		c.collectAgentHost("client", n, client)
   570  	}
   571  
   572  	for _, n := range c.serverIDs {
   573  		c.collectAgentHost("server", n, client)
   574  	}
   575  }
   576  
   577  // collectAgentHost gets the agent host data
   578  func (c *OperatorDebugCommand) collectAgentHost(path, id string, client *api.Client) {
   579  	var host *api.HostDataResponse
   580  	var err error
   581  	if path == "server" {
   582  		host, err = client.Agent().Host(id, "", nil)
   583  	} else {
   584  		host, err = client.Agent().Host("", id, nil)
   585  	}
   586  
   587  	if err != nil {
   588  		c.Ui.Error(fmt.Sprintf("%s/%s: Failed to retrieve agent host data, err: %v", path, id, err))
   589  
   590  		if strings.Contains(err.Error(), structs.ErrPermissionDenied.Error()) {
   591  			// Drop a hint to help the operator resolve the error
   592  			c.Ui.Warn("Agent host retrieval requires agent:read ACL or enable_debug=true.  See https://www.nomadproject.io/api-docs/agent#host for more information.")
   593  		}
   594  		return // exit on any error
   595  	}
   596  
   597  	path = filepath.Join(path, id)
   598  	c.writeJSON(path, "agent-host.json", host, err)
   599  }
   600  
   601  // collectPprofs captures the /agent/pprof for each listed node
   602  func (c *OperatorDebugCommand) collectPprofs(client *api.Client) {
   603  	for _, n := range c.nodeIDs {
   604  		c.collectPprof("client", n, client)
   605  	}
   606  
   607  	for _, n := range c.serverIDs {
   608  		c.collectPprof("server", n, client)
   609  	}
   610  }
   611  
   612  // collectPprof captures pprof data for the node
   613  func (c *OperatorDebugCommand) collectPprof(path, id string, client *api.Client) {
   614  	pprofDurationSeconds := int(c.pprofDuration.Seconds())
   615  	opts := api.PprofOptions{Seconds: pprofDurationSeconds}
   616  	if path == "server" {
   617  		opts.ServerID = id
   618  	} else {
   619  		opts.NodeID = id
   620  	}
   621  
   622  	path = filepath.Join(path, id)
   623  
   624  	bs, err := client.Agent().CPUProfile(opts, nil)
   625  	if err != nil {
   626  		c.Ui.Error(fmt.Sprintf("%s: Failed to retrieve pprof profile.prof, err: %v", path, err))
   627  		if structs.IsErrPermissionDenied(err) {
   628  			// All Profiles require the same permissions, so we only need to see
   629  			// one permission failure before we bail.
   630  			// But lets first drop a hint to help the operator resolve the error
   631  
   632  			c.Ui.Warn("Pprof retrieval requires agent:write ACL or enable_debug=true.  See https://www.nomadproject.io/api-docs/agent#agent-runtime-profiles for more information.")
   633  			return // only exit on 403
   634  		}
   635  	} else {
   636  		err := c.writeBytes(path, "profile.prof", bs)
   637  		if err != nil {
   638  			c.Ui.Error(err.Error())
   639  		}
   640  	}
   641  
   642  	bs, err = client.Agent().Trace(opts, nil)
   643  	if err != nil {
   644  		c.Ui.Error(fmt.Sprintf("%s: Failed to retrieve pprof trace.prof, err: %v", path, err))
   645  	} else {
   646  		err := c.writeBytes(path, "trace.prof", bs)
   647  		if err != nil {
   648  			c.Ui.Error(err.Error())
   649  		}
   650  	}
   651  
   652  	bs, err = client.Agent().Lookup("goroutine", opts, nil)
   653  	if err != nil {
   654  		c.Ui.Error(fmt.Sprintf("%s: Failed to retrieve pprof goroutine.prof, err: %v", path, err))
   655  	} else {
   656  		err := c.writeBytes(path, "goroutine.prof", bs)
   657  		if err != nil {
   658  			c.Ui.Error(err.Error())
   659  		}
   660  	}
   661  
   662  	// Gather goroutine text output - debug type 1
   663  	// debug type 1 writes the legacy text format for human readable output
   664  	opts.Debug = 1
   665  	bs, err = client.Agent().Lookup("goroutine", opts, nil)
   666  	if err != nil {
   667  		c.Ui.Error(fmt.Sprintf("%s: Failed to retrieve pprof goroutine-debug1.txt, err: %v", path, err))
   668  	} else {
   669  		err := c.writeBytes(path, "goroutine-debug1.txt", bs)
   670  		if err != nil {
   671  			c.Ui.Error(err.Error())
   672  		}
   673  	}
   674  
   675  	// Gather goroutine text output - debug type 2
   676  	// When printing the "goroutine" profile, debug=2 means to print the goroutine
   677  	// stacks in the same form that a Go program uses when dying due to an unrecovered panic.
   678  	opts.Debug = 2
   679  	bs, err = client.Agent().Lookup("goroutine", opts, nil)
   680  	if err != nil {
   681  		c.Ui.Error(fmt.Sprintf("%s: Failed to retrieve pprof goroutine-debug2.txt, err: %v", path, err))
   682  	} else {
   683  		err := c.writeBytes(path, "goroutine-debug2.txt", bs)
   684  		if err != nil {
   685  			c.Ui.Error(err.Error())
   686  		}
   687  	}
   688  }
   689  
   690  // collectPeriodic runs for duration, capturing the cluster state every interval. It flushes and stops
   691  // the monitor requests
   692  func (c *OperatorDebugCommand) collectPeriodic(client *api.Client) {
   693  	duration := time.After(c.duration)
   694  	// Set interval to 0 so that we immediately execute, wait the interval next time
   695  	interval := time.After(0 * time.Second)
   696  	var intervalCount int
   697  	var name, dir string
   698  
   699  	for {
   700  		select {
   701  		case <-duration:
   702  			c.cancel()
   703  			return
   704  
   705  		case <-interval:
   706  			name = fmt.Sprintf("%04d", intervalCount)
   707  			dir = filepath.Join("nomad", name)
   708  			c.Ui.Output(fmt.Sprintf("    Capture interval %s", name))
   709  			c.collectNomad(dir, client)
   710  			c.collectOperator(dir, client)
   711  			interval = time.After(c.interval)
   712  			intervalCount++
   713  
   714  		case <-c.ctx.Done():
   715  			return
   716  		}
   717  	}
   718  }
   719  
   720  // collectOperator captures some cluster meta information
   721  func (c *OperatorDebugCommand) collectOperator(dir string, client *api.Client) {
   722  	rc, err := client.Operator().RaftGetConfiguration(nil)
   723  	c.writeJSON(dir, "operator-raft.json", rc, err)
   724  
   725  	sc, _, err := client.Operator().SchedulerGetConfiguration(nil)
   726  	c.writeJSON(dir, "operator-scheduler.json", sc, err)
   727  
   728  	ah, _, err := client.Operator().AutopilotServerHealth(nil)
   729  	c.writeJSON(dir, "operator-autopilot-health.json", ah, err)
   730  
   731  	lic, _, err := client.Operator().LicenseGet(nil)
   732  	c.writeJSON(dir, "license.json", lic, err)
   733  }
   734  
   735  // collectNomad captures the nomad cluster state
   736  func (c *OperatorDebugCommand) collectNomad(dir string, client *api.Client) error {
   737  	var qo *api.QueryOptions
   738  
   739  	js, _, err := client.Jobs().List(qo)
   740  	c.writeJSON(dir, "jobs.json", js, err)
   741  
   742  	ds, _, err := client.Deployments().List(qo)
   743  	c.writeJSON(dir, "deployments.json", ds, err)
   744  
   745  	es, _, err := client.Evaluations().List(qo)
   746  	c.writeJSON(dir, "evaluations.json", es, err)
   747  
   748  	as, _, err := client.Allocations().List(qo)
   749  	c.writeJSON(dir, "allocations.json", as, err)
   750  
   751  	ns, _, err := client.Nodes().List(qo)
   752  	c.writeJSON(dir, "nodes.json", ns, err)
   753  
   754  	// CSI Plugins - /v1/plugins?type=csi
   755  	ps, _, err := client.CSIPlugins().List(qo)
   756  	c.writeJSON(dir, "plugins.json", ps, err)
   757  
   758  	// CSI Plugin details - /v1/plugin/csi/:plugin_id
   759  	for _, p := range ps {
   760  		csiPlugin, _, err := client.CSIPlugins().Info(p.ID, qo)
   761  		csiPluginFileName := fmt.Sprintf("csi-plugin-id-%s.json", p.ID)
   762  		c.writeJSON(dir, csiPluginFileName, csiPlugin, err)
   763  	}
   764  
   765  	// CSI Volumes - /v1/volumes?type=csi
   766  	csiVolumes, _, err := client.CSIVolumes().List(qo)
   767  	c.writeJSON(dir, "csi-volumes.json", csiVolumes, err)
   768  
   769  	// CSI Volume details - /v1/volumes/csi/:volume-id
   770  	for _, v := range csiVolumes {
   771  		csiVolume, _, err := client.CSIVolumes().Info(v.ID, qo)
   772  		csiFileName := fmt.Sprintf("csi-volume-id-%s.json", v.ID)
   773  		c.writeJSON(dir, csiFileName, csiVolume, err)
   774  	}
   775  
   776  	metrics, _, err := client.Operator().MetricsSummary(qo)
   777  	c.writeJSON(dir, "metrics.json", metrics, err)
   778  
   779  	return nil
   780  }
   781  
   782  // collectConsul calls the Consul API directly to collect data
   783  func (c *OperatorDebugCommand) collectConsul(dir, consul string) error {
   784  	addr := c.consul.addr(consul)
   785  	if addr == "" {
   786  		return nil
   787  	}
   788  
   789  	client := defaultHttpClient()
   790  	api.ConfigureTLS(client, c.consul.tls)
   791  
   792  	req, _ := http.NewRequest("GET", addr+"/v1/agent/self", nil)
   793  	req.Header.Add("X-Consul-Token", c.consul.token())
   794  	req.Header.Add("User-Agent", userAgent)
   795  	resp, err := client.Do(req)
   796  	c.writeBody(dir, "consul-agent-self.json", resp, err)
   797  
   798  	req, _ = http.NewRequest("GET", addr+"/v1/agent/members", nil)
   799  	req.Header.Add("X-Consul-Token", c.consul.token())
   800  	req.Header.Add("User-Agent", userAgent)
   801  	resp, err = client.Do(req)
   802  	c.writeBody(dir, "consul-agent-members.json", resp, err)
   803  
   804  	return nil
   805  }
   806  
   807  // collectVault calls the Vault API directly to collect data
   808  func (c *OperatorDebugCommand) collectVault(dir, vault string) error {
   809  	addr := c.vault.addr(vault)
   810  	if addr == "" {
   811  		return nil
   812  	}
   813  
   814  	client := defaultHttpClient()
   815  	api.ConfigureTLS(client, c.vault.tls)
   816  
   817  	req, _ := http.NewRequest("GET", addr+"/sys/health", nil)
   818  	req.Header.Add("X-Vault-Token", c.vault.token())
   819  	req.Header.Add("User-Agent", userAgent)
   820  	resp, err := client.Do(req)
   821  	c.writeBody(dir, "vault-sys-health.json", resp, err)
   822  
   823  	return nil
   824  }
   825  
   826  // writeBytes writes a file to the archive, recording it in the manifest
   827  func (c *OperatorDebugCommand) writeBytes(dir, file string, data []byte) error {
   828  	// Replace invalid characters in filename
   829  	filename := helper.CleanFilename(file, "_")
   830  
   831  	relativePath := filepath.Join(dir, filename)
   832  	c.manifest = append(c.manifest, relativePath)
   833  	dirPath := filepath.Join(c.collectDir, dir)
   834  	filePath := filepath.Join(dirPath, filename)
   835  
   836  	// Ensure parent directories exist
   837  	err := os.MkdirAll(dirPath, os.ModePerm)
   838  	if err != nil {
   839  		return fmt.Errorf("failed to create parent directories of \"%s\": %w", dirPath, err)
   840  	}
   841  
   842  	// Ensure filename doesn't escape the sandbox of the capture directory
   843  	escapes := helper.PathEscapesSandbox(c.collectDir, filePath)
   844  	if escapes {
   845  		return fmt.Errorf("file path \"%s\" escapes capture directory \"%s\"", filePath, c.collectDir)
   846  	}
   847  
   848  	// Create the file
   849  	fh, err := os.Create(filePath)
   850  	if err != nil {
   851  		return fmt.Errorf("failed to create file \"%s\", err: %w", filePath, err)
   852  	}
   853  	defer fh.Close()
   854  
   855  	_, err = fh.Write(data)
   856  	if err != nil {
   857  		return fmt.Errorf("Failed to write data to file \"%s\", err: %w", filePath, err)
   858  	}
   859  	return nil
   860  }
   861  
   862  // writeJSON writes JSON responses from the Nomad API calls to the archive
   863  func (c *OperatorDebugCommand) writeJSON(dir, file string, data interface{}, err error) error {
   864  	if err != nil {
   865  		return c.writeError(dir, file, err)
   866  	}
   867  	bytes, err := json.Marshal(data)
   868  	if err != nil {
   869  		return c.writeError(dir, file, err)
   870  	}
   871  	err = c.writeBytes(dir, file, bytes)
   872  	if err != nil {
   873  		c.Ui.Error(err.Error())
   874  	}
   875  	return nil
   876  }
   877  
   878  // writeError writes a JSON error object to capture errors in the debug bundle without
   879  // reporting
   880  func (c *OperatorDebugCommand) writeError(dir, file string, err error) error {
   881  	bytes, err := json.Marshal(errorWrapper{Error: err.Error()})
   882  	if err != nil {
   883  		return err
   884  	}
   885  	return c.writeBytes(dir, file, bytes)
   886  }
   887  
// errorWrapper is the JSON object writeError stores in the archive in place of
// data that could not be collected. Error holds the message of the error that
// was encountered.
type errorWrapper struct {
	Error string
}
   891  
   892  // writeBody is a helper that writes the body of an http.Response to the archive
   893  func (c *OperatorDebugCommand) writeBody(dir, file string, resp *http.Response, err error) {
   894  	if err != nil {
   895  		c.writeError(dir, file, err)
   896  		return
   897  	}
   898  
   899  	if resp.ContentLength == 0 {
   900  		return
   901  	}
   902  
   903  	defer resp.Body.Close()
   904  
   905  	body, err := ioutil.ReadAll(resp.Body)
   906  	if err != nil {
   907  		c.writeError(dir, file, err)
   908  		return
   909  	}
   910  
   911  	if err := c.writeBytes(dir, file, body); err != nil {
   912  		c.Ui.Error(err.Error())
   913  	}
   914  }
   915  
   916  // writeManifest creates the index files
   917  func (c *OperatorDebugCommand) writeManifest() error {
   918  	// Write the JSON
   919  	path := filepath.Join(c.collectDir, "index.json")
   920  	jsonFh, err := os.Create(path)
   921  	if err != nil {
   922  		return err
   923  	}
   924  	defer jsonFh.Close()
   925  
   926  	json.NewEncoder(jsonFh).Encode(c.manifest)
   927  
   928  	// Write the HTML
   929  	path = filepath.Join(c.collectDir, "index.html")
   930  	htmlFh, err := os.Create(path)
   931  	if err != nil {
   932  		return err
   933  	}
   934  	defer htmlFh.Close()
   935  
   936  	head, _ := template.New("head").Parse("<html><head><title>{{.}}</title></head>\n<body><h1>{{.}}</h1>\n<ul>")
   937  	line, _ := template.New("line").Parse("<li><a href=\"{{.}}\">{{.}}</a></li>\n")
   938  	if err != nil {
   939  		return fmt.Errorf("%v", err)
   940  	}
   941  	tail := "</ul></body></html>\n"
   942  
   943  	head.Execute(htmlFh, c.timestamp)
   944  	for _, f := range c.manifest {
   945  		line.Execute(htmlFh, f)
   946  	}
   947  	htmlFh.WriteString(tail)
   948  
   949  	return nil
   950  }
   951  
   952  // trap captures signals, and closes stopCh
   953  func (c *OperatorDebugCommand) trap() {
   954  	sigCh := make(chan os.Signal, 1)
   955  	signal.Notify(sigCh,
   956  		syscall.SIGHUP,
   957  		syscall.SIGINT,
   958  		syscall.SIGTERM,
   959  		syscall.SIGQUIT)
   960  
   961  	go func() {
   962  		<-sigCh
   963  		c.cancel()
   964  	}()
   965  }
   966  
   967  // TarCZF, like the tar command, recursively builds a gzip compressed tar archive from a
   968  // directory. If not empty, all files in the bundle are prefixed with the target path
   969  func TarCZF(archive string, src, target string) error {
   970  	// ensure the src actually exists before trying to tar it
   971  	if _, err := os.Stat(src); err != nil {
   972  		return fmt.Errorf("Unable to tar files - %v", err.Error())
   973  	}
   974  
   975  	// create the archive
   976  	fh, err := os.Create(archive)
   977  	if err != nil {
   978  		return err
   979  	}
   980  	defer fh.Close()
   981  
   982  	zz := gzip.NewWriter(fh)
   983  	defer zz.Close()
   984  
   985  	tw := tar.NewWriter(zz)
   986  	defer tw.Close()
   987  
   988  	// tar
   989  	return filepath.Walk(src, func(file string, fi os.FileInfo, err error) error {
   990  
   991  		// return on any error
   992  		if err != nil {
   993  			return err
   994  		}
   995  
   996  		if !fi.Mode().IsRegular() {
   997  			return nil
   998  		}
   999  
  1000  		header, err := tar.FileInfoHeader(fi, fi.Name())
  1001  		if err != nil {
  1002  			return err
  1003  		}
  1004  
  1005  		// remove leading path to the src, so files are relative to the archive
  1006  		path := strings.Replace(file, src, "", -1)
  1007  		if target != "" {
  1008  			path = filepath.Join([]string{target, path}...)
  1009  		}
  1010  		path = strings.TrimPrefix(path, string(filepath.Separator))
  1011  
  1012  		header.Name = path
  1013  
  1014  		if err := tw.WriteHeader(header); err != nil {
  1015  			return err
  1016  		}
  1017  
  1018  		// copy the file contents
  1019  		f, err := os.Open(file)
  1020  		if err != nil {
  1021  			return err
  1022  		}
  1023  
  1024  		if _, err := io.Copy(tw, f); err != nil {
  1025  			return err
  1026  		}
  1027  
  1028  		f.Close()
  1029  
  1030  		return nil
  1031  	})
  1032  }
  1033  
  1034  // argNodes splits node ids from the command line by ","
  1035  func argNodes(input string) []string {
  1036  	ns := strings.Split(input, ",")
  1037  	var out []string
  1038  	for _, n := range ns {
  1039  		s := strings.TrimSpace(n)
  1040  		if s == "" {
  1041  			continue
  1042  		}
  1043  		out = append(out, s)
  1044  	}
  1045  	return out
  1046  }
  1047  
// external holds address configuration for Consul and Vault APIs
//
// Fields, as used by the code in this file:
//   - tls: TLS settings passed to api.ConfigureTLS for the HTTP client
//   - addrVal: the configured address; addr() falls back to a default when it
//     is empty and otherwise normalizes its scheme
//   - ssl: selects the scheme addr() enforces (https when true, http when false)
//   - tokenVal: token returned by token() when non-empty
//   - tokenFile: path of a file whose trimmed contents token() returns when
//     tokenVal is empty
//   - auth: NOTE(review): not referenced in this part of the file; presumably
//     authentication credentials — confirm against the flag parsing code
type external struct {
	tls       *api.TLSConfig
	addrVal   string
	auth      string
	ssl       bool
	tokenVal  string
	tokenFile string
}
  1057  
  1058  func (e *external) addr(defaultAddr string) string {
  1059  	if e.addrVal == "" {
  1060  		return defaultAddr
  1061  	}
  1062  
  1063  	if !e.ssl {
  1064  		if strings.HasPrefix(e.addrVal, "http:") {
  1065  			return e.addrVal
  1066  		}
  1067  		if strings.HasPrefix(e.addrVal, "https:") {
  1068  			// Mismatch: e.ssl=false but addrVal is https
  1069  			return strings.ReplaceAll(e.addrVal, "https://", "http://")
  1070  		}
  1071  		return "http://" + e.addrVal
  1072  	}
  1073  
  1074  	if strings.HasPrefix(e.addrVal, "https:") {
  1075  		return e.addrVal
  1076  	}
  1077  
  1078  	if strings.HasPrefix(e.addrVal, "http:") {
  1079  		// Mismatch: e.ssl=true but addrVal is http
  1080  		return strings.ReplaceAll(e.addrVal, "http://", "https://")
  1081  	}
  1082  
  1083  	return "https://" + e.addrVal
  1084  }
  1085  
  1086  func (e *external) token() string {
  1087  	if e.tokenVal != "" {
  1088  		return e.tokenVal
  1089  	}
  1090  
  1091  	if e.tokenFile != "" {
  1092  		bs, err := ioutil.ReadFile(e.tokenFile)
  1093  		if err == nil {
  1094  			return strings.TrimSpace(string(bs))
  1095  		}
  1096  	}
  1097  
  1098  	return ""
  1099  }
  1100  
  1101  // defaultHttpClient configures a basic httpClient
  1102  func defaultHttpClient() *http.Client {
  1103  	httpClient := cleanhttp.DefaultClient()
  1104  	transport := httpClient.Transport.(*http.Transport)
  1105  	transport.TLSHandshakeTimeout = 10 * time.Second
  1106  	transport.TLSClientConfig = &tls.Config{
  1107  		MinVersion: tls.VersionTLS12,
  1108  	}
  1109  
  1110  	return httpClient
  1111  }