github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/cli/zip.go (about)

     1  // Copyright 2017 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package cli
    12  
    13  import (
    14  	"archive/zip"
    15  	"context"
    16  	"encoding/json"
    17  	"fmt"
    18  	"io"
    19  	"net"
    20  	"net/url"
    21  	"os"
    22  	"path/filepath"
    23  	"sort"
    24  	"strconv"
    25  	"strings"
    26  	"time"
    27  	"unicode"
    28  
    29  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverpb"
    30  	"github.com/cockroachdb/cockroach/pkg/roachpb"
    31  	"github.com/cockroachdb/cockroach/pkg/server/serverpb"
    32  	"github.com/cockroachdb/cockroach/pkg/server/status/statuspb"
    33  	"github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgcode"
    34  	"github.com/cockroachdb/cockroach/pkg/util/contextutil"
    35  	"github.com/cockroachdb/cockroach/pkg/util/log"
    36  	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
    37  	"github.com/cockroachdb/errors"
    38  	"github.com/lib/pq"
    39  	"github.com/spf13/cobra"
    40  )
    41  
// debugZipCmd implements `cockroach debug zip`. The single positional
// argument is the output zip file name; the collection logic lives in
// runDebugZip.
var debugZipCmd = &cobra.Command{
	Use:   "zip <file>",
	Short: "gather cluster debug data into a zip file",
	Long: `

Gather cluster debug data into a zip file. Data includes cluster events, node
liveness, node status, range status, node stack traces, node engine stats, log
files, and SQL schema.

Retrieval of per-node details (status, stack traces, range status, engine stats)
requires the node to be live and operating properly. Retrieval of SQL data
requires the cluster to be live.
`,
	Args: cobra.ExactArgs(1),
	RunE: MaybeDecorateGRPCError(runDebugZip),
}
    58  
// Tables containing cluster-wide info that are collected once per debug
// zip. Each table is dumped via dumpTableDataForZip; the table name also
// doubles as the name of the .txt entry inside the archive.
var debugZipTablesPerCluster = []string{
	"crdb_internal.cluster_queries",
	"crdb_internal.cluster_sessions",
	"crdb_internal.cluster_settings",
	"crdb_internal.cluster_transactions",

	"crdb_internal.jobs",
	"system.jobs",       // get the raw, restorable jobs records too.
	"system.descriptor", // descriptors also contain job-like mutation state.
	"system.namespace",
	"system.namespace2", // TODO(sqlexec): consider removing in 20.2 or later.

	"crdb_internal.kv_node_status",
	"crdb_internal.kv_store_status",

	"crdb_internal.schema_changes",
	"crdb_internal.partitions",
	"crdb_internal.zones",
}
    79  
// Tables collected from each node in a debug zip. These are dumped once
// per node, over a SQL connection pointed at that specific node (see the
// per-node loop in runDebugZip).
var debugZipTablesPerNode = []string{
	"crdb_internal.feature_usage",

	"crdb_internal.gossip_alerts",
	"crdb_internal.gossip_liveness",
	"crdb_internal.gossip_network",
	"crdb_internal.gossip_nodes",

	"crdb_internal.leases",

	"crdb_internal.node_build_info",
	"crdb_internal.node_metrics",
	"crdb_internal.node_queries",
	"crdb_internal.node_runtime_info",
	"crdb_internal.node_sessions",
	"crdb_internal.node_statement_statistics",
	"crdb_internal.node_transactions",
	"crdb_internal.node_txn_stats",
}
   100  
// Override for the default SELECT * when dumping the table. Tables not
// listed here are dumped with a plain `SELECT *`; the override here adds
// hex-encoded copies of protobuf-valued columns so they survive the
// text-format dump.
var customSelectClause = map[string]string{
	"system.jobs":       "*, to_hex(payload) AS hex_payload, to_hex(progress) AS hex_progress",
	"system.descriptor": "*, to_hex(descriptor) AS hex_descriptor",
}
   106  
   107  type zipper struct {
   108  	f *os.File
   109  	z *zip.Writer
   110  }
   111  
   112  func newZipper(f *os.File) *zipper {
   113  	return &zipper{
   114  		f: f,
   115  		z: zip.NewWriter(f),
   116  	}
   117  }
   118  
   119  func (z *zipper) close() error {
   120  	err1 := z.z.Close()
   121  	err2 := z.f.Close()
   122  	return errors.CombineErrors(err1, err2)
   123  }
   124  
   125  func (z *zipper) create(name string, mtime time.Time) (io.Writer, error) {
   126  	fmt.Printf("writing: %s\n", name)
   127  	if mtime.IsZero() {
   128  		mtime = timeutil.Now()
   129  	}
   130  	return z.z.CreateHeader(&zip.FileHeader{
   131  		Name:     name,
   132  		Method:   zip.Deflate,
   133  		Modified: mtime,
   134  	})
   135  }
   136  
   137  func (z *zipper) createRaw(name string, b []byte) error {
   138  	w, err := z.create(name, time.Time{})
   139  	if err != nil {
   140  		return err
   141  	}
   142  	_, err = w.Write(b)
   143  	return err
   144  }
   145  
   146  func (z *zipper) createJSON(name string, m interface{}) error {
   147  	if !strings.HasSuffix(name, ".json") {
   148  		return errors.Errorf("%s does not have .json suffix", name)
   149  	}
   150  	b, err := json.MarshalIndent(m, "", "  ")
   151  	if err != nil {
   152  		return err
   153  	}
   154  	return z.createRaw(name, b)
   155  }
   156  
   157  func (z *zipper) createError(name string, e error) error {
   158  	w, err := z.create(name+".err.txt", time.Time{})
   159  	if err != nil {
   160  		return err
   161  	}
   162  	fmt.Printf("  ^- resulted in %s\n", e)
   163  	fmt.Fprintf(w, "%s\n", e)
   164  	return nil
   165  }
   166  
   167  func (z *zipper) createJSONOrError(name string, m interface{}, e error) error {
   168  	if e != nil {
   169  		return z.createError(name, e)
   170  	}
   171  	return z.createJSON(name, m)
   172  }
   173  
   174  func (z *zipper) createRawOrError(name string, b []byte, e error) error {
   175  	if filepath.Ext(name) == "" {
   176  		return errors.Errorf("%s has no extension", name)
   177  	}
   178  	if e != nil {
   179  		return z.createError(name, e)
   180  	}
   181  	return z.createRaw(name, b)
   182  }
   183  
// zipRequest abstracts a possibly remote request to collect one piece
// of data for the zip: fn produces the payload and pathName is the
// archive entry name (without the ".json" extension, which the caller
// appends) it is stored under.
type zipRequest struct {
	fn       func(ctx context.Context) (interface{}, error)
	pathName string
}
   188  
   189  func guessNodeURL(workingURL string, hostport string) *sqlConn {
   190  	u, err := url.Parse(workingURL)
   191  	if err != nil {
   192  		u = &url.URL{Host: "invalid"}
   193  	}
   194  	u.Host = hostport
   195  	return makeSQLConn(u.String())
   196  }
   197  
// runZipRequestWithTimeout prints requestName as a progress marker and
// then runs fn under a context derived from ctx that is canceled after
// timeout. The error from fn (or the timeout) is returned.
func runZipRequestWithTimeout(
	ctx context.Context,
	requestName string,
	timeout time.Duration,
	fn func(ctx context.Context) error,
) error {
	fmt.Printf("%s... ", requestName)
	return contextutil.RunWithTimeout(ctx, requestName, timeout, fn)
}
   207  
   208  func runDebugZip(cmd *cobra.Command, args []string) (retErr error) {
   209  	const (
   210  		base          = "debug"
   211  		eventsName    = base + "/events"
   212  		livenessName  = base + "/liveness"
   213  		nodesPrefix   = base + "/nodes"
   214  		rangelogName  = base + "/rangelog"
   215  		reportsPrefix = base + "/reports"
   216  		schemaPrefix  = base + "/schema"
   217  		settingsName  = base + "/settings"
   218  	)
   219  
   220  	baseCtx, cancel := context.WithCancel(context.Background())
   221  	defer cancel()
   222  
   223  	fmt.Printf("establishing RPC connection to %s...\n", serverCfg.AdvertiseAddr)
   224  	conn, _, finish, err := getClientGRPCConn(baseCtx, serverCfg)
   225  	if err != nil {
   226  		return err
   227  	}
   228  	defer finish()
   229  
   230  	status := serverpb.NewStatusClient(conn)
   231  	admin := serverpb.NewAdminClient(conn)
   232  
   233  	fmt.Println("retrieving the node status to get the SQL address...")
   234  	nodeD, err := status.Details(baseCtx, &serverpb.DetailsRequest{NodeId: "local"})
   235  	if err != nil {
   236  		return err
   237  	}
   238  	sqlAddr := nodeD.SQLAddress
   239  	if sqlAddr.IsEmpty() {
   240  		// No SQL address: either a pre-19.2 node, or same address for both
   241  		// SQL and RPC.
   242  		sqlAddr = nodeD.Address
   243  	}
   244  	fmt.Printf("using SQL address: %s\n", sqlAddr.AddressField)
   245  	cliCtx.clientConnHost, cliCtx.clientConnPort, err = net.SplitHostPort(sqlAddr.AddressField)
   246  	if err != nil {
   247  		return err
   248  	}
   249  
   250  	// We're going to use the SQL code, but in non-interactive mode.
   251  	// Override whatever terminal-driven defaults there may be out there.
   252  	cliCtx.isInteractive = false
   253  	cliCtx.terminalOutput = false
   254  	sqlCtx.showTimes = false
   255  	// Use a streaming format to avoid accumulating all rows in RAM.
   256  	cliCtx.tableDisplayFormat = tableDisplayTSV
   257  
   258  	sqlConn, err := makeSQLClient("cockroach zip", useSystemDb)
   259  	if err != nil {
   260  		log.Warningf(baseCtx, "unable to open a SQL session. Debug information will be incomplete: %s", err)
   261  	}
   262  	defer sqlConn.Close()
   263  	// Note: we're not printing "connection established" because the driver we're using
   264  	// does late binding.
   265  	if sqlConn != nil {
   266  		fmt.Printf("using SQL connection URL: %s\n", sqlConn.url)
   267  	}
   268  
   269  	name := args[0]
   270  	out, err := os.Create(name)
   271  	if err != nil {
   272  		return err
   273  	}
   274  	fmt.Printf("writing %s\n", name)
   275  
   276  	z := newZipper(out)
   277  	defer func() {
   278  		cErr := z.close()
   279  		retErr = errors.CombineErrors(retErr, cErr)
   280  	}()
   281  
   282  	timeout := 10 * time.Second
   283  	if cliCtx.cmdTimeout != 0 {
   284  		timeout = cliCtx.cmdTimeout
   285  	}
   286  
   287  	var runZipRequest = func(r zipRequest) error {
   288  		var data interface{}
   289  		err = runZipRequestWithTimeout(baseCtx, "requesting data for "+r.pathName, timeout, func(ctx context.Context) error {
   290  			data, err = r.fn(ctx)
   291  			return err
   292  		})
   293  		return z.createJSONOrError(r.pathName+".json", data, err)
   294  	}
   295  
   296  	for _, r := range []zipRequest{
   297  		{
   298  			fn: func(ctx context.Context) (interface{}, error) {
   299  				return admin.Events(ctx, &serverpb.EventsRequest{})
   300  			},
   301  			pathName: eventsName,
   302  		},
   303  		{
   304  			fn: func(ctx context.Context) (interface{}, error) {
   305  				return admin.RangeLog(ctx, &serverpb.RangeLogRequest{})
   306  			},
   307  			pathName: rangelogName,
   308  		},
   309  		{
   310  			fn: func(ctx context.Context) (interface{}, error) {
   311  				return admin.Liveness(ctx, &serverpb.LivenessRequest{})
   312  			},
   313  			pathName: livenessName,
   314  		},
   315  		{
   316  			fn: func(ctx context.Context) (interface{}, error) {
   317  				return admin.Settings(ctx, &serverpb.SettingsRequest{})
   318  			},
   319  			pathName: settingsName,
   320  		},
   321  		{
   322  			fn: func(ctx context.Context) (interface{}, error) {
   323  				return status.ProblemRanges(ctx, &serverpb.ProblemRangesRequest{})
   324  			},
   325  			pathName: reportsPrefix + "/problemranges",
   326  		},
   327  	} {
   328  		if err := runZipRequest(r); err != nil {
   329  			return err
   330  		}
   331  	}
   332  
   333  	for _, table := range debugZipTablesPerCluster {
   334  		selectClause, ok := customSelectClause[table]
   335  		if !ok {
   336  			selectClause = "*"
   337  		}
   338  		if err := dumpTableDataForZip(z, sqlConn, timeout, base, table, selectClause); err != nil {
   339  			return errors.Wrapf(err, "fetching %s", table)
   340  		}
   341  	}
   342  
   343  	{
   344  		var nodes *serverpb.NodesResponse
   345  		err := runZipRequestWithTimeout(baseCtx, "requesting nodes", timeout, func(ctx context.Context) error {
   346  			nodes, err = status.Nodes(ctx, &serverpb.NodesRequest{})
   347  			return err
   348  		})
   349  		if cErr := z.createJSONOrError(base+"/nodes.json", nodes, err); cErr != nil {
   350  			return cErr
   351  		}
   352  
   353  		// In case nodes came up back empty (the Nodes() RPC failed), we
   354  		// still want to inspect the per-node endpoints on the head
   355  		// node. As per the above, we were able to connect at least to
   356  		// that.
   357  		nodeList := []statuspb.NodeStatus{{Desc: roachpb.NodeDescriptor{
   358  			NodeID:     nodeD.NodeID,
   359  			Address:    nodeD.Address,
   360  			SQLAddress: nodeD.SQLAddress,
   361  		}}}
   362  		if nodes != nil {
   363  			// If the nodes were found, use that instead.
   364  			nodeList = nodes.Nodes
   365  		}
   366  
   367  		// We'll want livenesses to decide whether a node is decommissioned.
   368  		var lresponse *serverpb.LivenessResponse
   369  		err = runZipRequestWithTimeout(baseCtx, "requesting liveness", timeout, func(ctx context.Context) error {
   370  			lresponse, err = admin.Liveness(ctx, &serverpb.LivenessRequest{})
   371  			return err
   372  		})
   373  		if cErr := z.createJSONOrError(base+"/liveness.json", nodes, err); cErr != nil {
   374  			return cErr
   375  		}
   376  		livenessByNodeID := map[roachpb.NodeID]kvserverpb.NodeLivenessStatus{}
   377  		if lresponse != nil {
   378  			livenessByNodeID = lresponse.Statuses
   379  		}
   380  
   381  		for _, node := range nodeList {
   382  			nodeID := node.Desc.NodeID
   383  
   384  			liveness := livenessByNodeID[nodeID]
   385  			if liveness == kvserverpb.NodeLivenessStatus_DECOMMISSIONED {
   386  				// Decommissioned + process terminated. Let's not waste time
   387  				// on this node.
   388  				//
   389  				// NB: we still inspect DECOMMISSIONING nodes (marked as
   390  				// decommissioned but the process is still alive) to get a
   391  				// chance to collect their log files.
   392  				//
   393  				// NB: we still inspect DEAD nodes because even though they
   394  				// don't heartbeat their liveness record their process might
   395  				// still be up and willing to deliver some log files.
   396  				continue
   397  			}
   398  
   399  			id := fmt.Sprintf("%d", nodeID)
   400  			prefix := fmt.Sprintf("%s/%s", nodesPrefix, id)
   401  
   402  			if !zipCtx.nodes.isIncluded(nodeID) {
   403  				if err := z.createRaw(prefix+".skipped",
   404  					[]byte(fmt.Sprintf("skipping excluded node %d\n", nodeID))); err != nil {
   405  					return err
   406  				}
   407  				continue
   408  			}
   409  
   410  			// Don't use sqlConn because that's only for is the node `debug
   411  			// zip` was pointed at, but here we want to connect to nodes
   412  			// individually to grab node- local SQL tables. Try to guess by
   413  			// replacing the host in the connection string; this may or may
   414  			// not work and if it doesn't, we let the invalid curSQLConn get
   415  			// used anyway so that anything that does *not* need it will
   416  			// still happen.
   417  			sqlAddr := node.Desc.SQLAddress
   418  			if sqlAddr.IsEmpty() {
   419  				// No SQL address: either a pre-19.2 node, or same address for both
   420  				// SQL and RPC.
   421  				sqlAddr = node.Desc.Address
   422  			}
   423  			curSQLConn := guessNodeURL(sqlConn.url, sqlAddr.AddressField)
   424  			if err := z.createJSON(prefix+"/status.json", node); err != nil {
   425  				return err
   426  			}
   427  			fmt.Printf("using SQL connection URL for node %s: %s\n", id, curSQLConn.url)
   428  
   429  			for _, table := range debugZipTablesPerNode {
   430  				selectClause, ok := customSelectClause[table]
   431  				if !ok {
   432  					selectClause = "*"
   433  				}
   434  				if err := dumpTableDataForZip(z, curSQLConn, timeout, prefix, table, selectClause); err != nil {
   435  					return errors.Wrapf(err, "fetching %s", table)
   436  				}
   437  			}
   438  
   439  			for _, r := range []zipRequest{
   440  				{
   441  					fn: func(ctx context.Context) (interface{}, error) {
   442  						return status.Details(ctx, &serverpb.DetailsRequest{NodeId: id})
   443  					},
   444  					pathName: prefix + "/details",
   445  				},
   446  				{
   447  					fn: func(ctx context.Context) (interface{}, error) {
   448  						return status.Gossip(ctx, &serverpb.GossipRequest{NodeId: id})
   449  					},
   450  					pathName: prefix + "/gossip",
   451  				},
   452  				{
   453  					fn: func(ctx context.Context) (interface{}, error) {
   454  						return status.EngineStats(ctx, &serverpb.EngineStatsRequest{NodeId: id})
   455  					},
   456  					pathName: prefix + "/enginestats",
   457  				},
   458  			} {
   459  				if err := runZipRequest(r); err != nil {
   460  					return err
   461  				}
   462  			}
   463  
   464  			var stacksData []byte
   465  			err = runZipRequestWithTimeout(baseCtx, "requesting stacks for node "+id, timeout,
   466  				func(ctx context.Context) error {
   467  					stacks, err := status.Stacks(ctx, &serverpb.StacksRequest{
   468  						NodeId: id,
   469  						Type:   serverpb.StacksType_GOROUTINE_STACKS,
   470  					})
   471  					if err == nil {
   472  						stacksData = stacks.Data
   473  					}
   474  					return err
   475  				})
   476  			if err := z.createRawOrError(prefix+"/stacks.txt", stacksData, err); err != nil {
   477  				return err
   478  			}
   479  
   480  			var threadData []byte
   481  			err = runZipRequestWithTimeout(baseCtx, "requesting threads for node "+id, timeout,
   482  				func(ctx context.Context) error {
   483  					threads, err := status.Stacks(ctx, &serverpb.StacksRequest{
   484  						NodeId: id,
   485  						Type:   serverpb.StacksType_THREAD_STACKS,
   486  					})
   487  					if err == nil {
   488  						threadData = threads.Data
   489  					}
   490  					return err
   491  				})
   492  			if err := z.createRawOrError(prefix+"/threads.txt", threadData, err); err != nil {
   493  				return err
   494  			}
   495  
   496  			var heapData []byte
   497  			err = runZipRequestWithTimeout(baseCtx, "requesting heap profile for node "+id, timeout,
   498  				func(ctx context.Context) error {
   499  					heap, err := status.Profile(ctx, &serverpb.ProfileRequest{
   500  						NodeId: id,
   501  						Type:   serverpb.ProfileRequest_HEAP,
   502  					})
   503  					if err == nil {
   504  						heapData = heap.Data
   505  					}
   506  					return err
   507  				})
   508  			if err := z.createRawOrError(prefix+"/heap.pprof", heapData, err); err != nil {
   509  				return err
   510  			}
   511  
   512  			var profiles *serverpb.GetFilesResponse
   513  			if err := runZipRequestWithTimeout(baseCtx, "requesting heap files for node "+id, timeout,
   514  				func(ctx context.Context) error {
   515  					profiles, err = status.GetFiles(ctx, &serverpb.GetFilesRequest{
   516  						NodeId:   id,
   517  						Type:     serverpb.FileType_HEAP,
   518  						Patterns: []string{"*"},
   519  					})
   520  					return err
   521  				}); err != nil {
   522  				if err := z.createError(prefix+"/heapprof", err); err != nil {
   523  					return err
   524  				}
   525  			} else {
   526  				fmt.Printf("%d found\n", len(profiles.Files))
   527  				for _, file := range profiles.Files {
   528  					name := prefix + "/heapprof/" + file.Name + ".pprof"
   529  					if err := z.createRaw(name, file.Contents); err != nil {
   530  						return err
   531  					}
   532  				}
   533  			}
   534  
   535  			var goroutinesResp *serverpb.GetFilesResponse
   536  			if err := runZipRequestWithTimeout(baseCtx, "requesting goroutine files for node "+id, timeout,
   537  				func(ctx context.Context) error {
   538  					goroutinesResp, err = status.GetFiles(ctx, &serverpb.GetFilesRequest{
   539  						NodeId:   id,
   540  						Type:     serverpb.FileType_GOROUTINES,
   541  						Patterns: []string{"*"},
   542  					})
   543  					return err
   544  				}); err != nil {
   545  				if err := z.createError(prefix+"/goroutines", err); err != nil {
   546  					return err
   547  				}
   548  			} else {
   549  				fmt.Printf("%d found\n", len(goroutinesResp.Files))
   550  				for _, file := range goroutinesResp.Files {
   551  					// NB: the files have a .txt.gz suffix already.
   552  					name := prefix + "/goroutines/" + file.Name
   553  					if err := z.createRawOrError(name, file.Contents, err); err != nil {
   554  						return err
   555  					}
   556  				}
   557  			}
   558  
   559  			var logs *serverpb.LogFilesListResponse
   560  			if err := runZipRequestWithTimeout(baseCtx, "requesting log files list", timeout,
   561  				func(ctx context.Context) error {
   562  					logs, err = status.LogFilesList(
   563  						ctx, &serverpb.LogFilesListRequest{NodeId: id})
   564  					return err
   565  				}); err != nil {
   566  				if err := z.createError(prefix+"/logs", err); err != nil {
   567  					return err
   568  				}
   569  			} else {
   570  				fmt.Printf("%d found\n", len(logs.Files))
   571  				for _, file := range logs.Files {
   572  					name := prefix + "/logs/" + file.Name
   573  					var entries *serverpb.LogEntriesResponse
   574  					if err := runZipRequestWithTimeout(baseCtx, fmt.Sprintf("requesting log file %s", file.Name), timeout,
   575  						func(ctx context.Context) error {
   576  							entries, err = status.LogFile(
   577  								ctx, &serverpb.LogFileRequest{NodeId: id, File: file.Name})
   578  							return err
   579  						}); err != nil {
   580  						if err := z.createError(name, err); err != nil {
   581  							return err
   582  						}
   583  						continue
   584  					}
   585  					logOut, err := z.create(name, timeutil.Unix(0, file.ModTimeNanos))
   586  					if err != nil {
   587  						return err
   588  					}
   589  					for _, e := range entries.Entries {
   590  						if err := e.Format(logOut); err != nil {
   591  							return err
   592  						}
   593  					}
   594  				}
   595  			}
   596  
   597  			var ranges *serverpb.RangesResponse
   598  			if err := runZipRequestWithTimeout(baseCtx, "requesting ranges", timeout, func(ctx context.Context) error {
   599  				ranges, err = status.Ranges(ctx, &serverpb.RangesRequest{NodeId: id})
   600  				return err
   601  			}); err != nil {
   602  				if err := z.createError(prefix+"/ranges", err); err != nil {
   603  					return err
   604  				}
   605  			} else {
   606  				fmt.Printf("%d found\n", len(ranges.Ranges))
   607  				sort.Slice(ranges.Ranges, func(i, j int) bool {
   608  					return ranges.Ranges[i].State.Desc.RangeID <
   609  						ranges.Ranges[j].State.Desc.RangeID
   610  				})
   611  				for _, r := range ranges.Ranges {
   612  					name := fmt.Sprintf("%s/ranges/%s", prefix, r.State.Desc.RangeID)
   613  					if err := z.createJSON(name+".json", r); err != nil {
   614  						return err
   615  					}
   616  				}
   617  			}
   618  		}
   619  	}
   620  
   621  	{
   622  		var databases *serverpb.DatabasesResponse
   623  		if err := runZipRequestWithTimeout(baseCtx, "requesting list of SQL databases", timeout, func(ctx context.Context) error {
   624  			databases, err = admin.Databases(ctx, &serverpb.DatabasesRequest{})
   625  			return err
   626  		}); err != nil {
   627  			if err := z.createError(schemaPrefix, err); err != nil {
   628  				return err
   629  			}
   630  		} else {
   631  			fmt.Printf("%d found\n", len(databases.Databases))
   632  			var dbEscaper fileNameEscaper
   633  			for _, dbName := range databases.Databases {
   634  				prefix := schemaPrefix + "/" + dbEscaper.escape(dbName)
   635  				var database *serverpb.DatabaseDetailsResponse
   636  				requestErr := runZipRequestWithTimeout(baseCtx, fmt.Sprintf("requesting database details for %s", dbName), timeout,
   637  					func(ctx context.Context) error {
   638  						database, err = admin.DatabaseDetails(ctx, &serverpb.DatabaseDetailsRequest{Database: dbName})
   639  						return err
   640  					})
   641  				if err := z.createJSONOrError(prefix+"@details.json", database, requestErr); err != nil {
   642  					return err
   643  				}
   644  				if requestErr != nil {
   645  					continue
   646  				}
   647  
   648  				fmt.Printf("%d tables found\n", len(database.TableNames))
   649  				var tbEscaper fileNameEscaper
   650  				for _, tableName := range database.TableNames {
   651  					name := prefix + "/" + tbEscaper.escape(tableName)
   652  					var table *serverpb.TableDetailsResponse
   653  					err := runZipRequestWithTimeout(baseCtx, fmt.Sprintf("requesting table details for %s.%s", dbName, tableName), timeout,
   654  						func(ctx context.Context) error {
   655  							table, err = admin.TableDetails(ctx, &serverpb.TableDetailsRequest{Database: dbName, Table: tableName})
   656  							return err
   657  						})
   658  					if err := z.createJSONOrError(name+".json", table, err); err != nil {
   659  						return err
   660  					}
   661  				}
   662  			}
   663  		}
   664  	}
   665  
   666  	return nil
   667  }
   668  
   669  type fileNameEscaper struct {
   670  	counters map[string]int
   671  }
   672  
   673  // escape ensures that f is stripped of characters that
   674  // may be invalid in file names. The characters are also lowercased
   675  // to ensure proper normalization in case-insensitive filesystems.
   676  func (fne *fileNameEscaper) escape(f string) string {
   677  	f = strings.ToLower(f)
   678  	var out strings.Builder
   679  	for _, c := range f {
   680  		if c < 127 && (unicode.IsLetter(c) || unicode.IsDigit(c)) {
   681  			out.WriteRune(c)
   682  		} else {
   683  			out.WriteByte('_')
   684  		}
   685  	}
   686  	objName := out.String()
   687  	result := objName
   688  
   689  	if fne.counters == nil {
   690  		fne.counters = make(map[string]int)
   691  	}
   692  	cnt := fne.counters[objName]
   693  	if cnt > 0 {
   694  		result += fmt.Sprintf("-%d", cnt)
   695  	}
   696  	cnt++
   697  	fne.counters[objName] = cnt
   698  	return result
   699  }
   700  
   701  func dumpTableDataForZip(
   702  	z *zipper, conn *sqlConn, timeout time.Duration, base, table, selectClause string,
   703  ) error {
   704  	query := fmt.Sprintf(`SET statement_timeout = '%s'; SELECT %s FROM %s`, timeout, selectClause, table)
   705  	baseName := base + "/" + table
   706  
   707  	fmt.Printf("retrieving SQL data for %s... ", table)
   708  	const maxRetries = 5
   709  	suffix := ""
   710  	for numRetries := 1; numRetries <= maxRetries; numRetries++ {
   711  		name := baseName + suffix + ".txt"
   712  		w, err := z.create(name, time.Time{})
   713  		if err != nil {
   714  			return err
   715  		}
   716  		// Pump the SQL rows directly into the zip writer, to avoid
   717  		// in-RAM buffering.
   718  		if err := runQueryAndFormatResults(conn, w, makeQuery(query)); err != nil {
   719  			if cErr := z.createError(name, err); cErr != nil {
   720  				return cErr
   721  			}
   722  			var pqErr *pq.Error
   723  			if !errors.As(err, &pqErr) {
   724  				// Not a SQL error. Nothing to retry.
   725  				break
   726  			}
   727  			if pqErr.Code != pgcode.SerializationFailure {
   728  				// A non-retry error. We've printed the error, and
   729  				// there's nothing to retry. Stop here.
   730  				break
   731  			}
   732  			// We've encountered a retry error. Add a suffix then loop.
   733  			suffix = fmt.Sprintf(".%d", numRetries)
   734  			continue
   735  		}
   736  		break
   737  	}
   738  	return nil
   739  }
   740  
   741  type nodeSelection struct {
   742  	inclusive     rangeSelection
   743  	exclusive     rangeSelection
   744  	includedCache map[int]struct{}
   745  	excludedCache map[int]struct{}
   746  }
   747  
   748  func (n *nodeSelection) isIncluded(nodeID roachpb.NodeID) bool {
   749  	// Avoid recomputing the maps on every call.
   750  	if n.includedCache == nil {
   751  		n.includedCache = n.inclusive.items()
   752  	}
   753  	if n.excludedCache == nil {
   754  		n.excludedCache = n.exclusive.items()
   755  	}
   756  
   757  	// If the included cache is empty, then we're assuming the node is included.
   758  	isIncluded := true
   759  	if len(n.includedCache) > 0 {
   760  		_, isIncluded = n.includedCache[int(nodeID)]
   761  	}
   762  	// Then filter out excluded IDs.
   763  	if _, excluded := n.excludedCache[int(nodeID)]; excluded {
   764  		isIncluded = false
   765  	}
   766  	return isIncluded
   767  }
   768  
   769  type rangeSelection struct {
   770  	input  string
   771  	ranges []vrange
   772  }
   773  
   774  type vrange struct {
   775  	a, b int
   776  }
   777  
   778  func (r *rangeSelection) String() string { return r.input }
   779  
   780  func (r *rangeSelection) Type() string {
   781  	return "a-b,c,d-e,..."
   782  }
   783  
   784  func (r *rangeSelection) Set(v string) error {
   785  	r.input = v
   786  	for _, rs := range strings.Split(v, ",") {
   787  		var thisRange vrange
   788  		if strings.Contains(rs, "-") {
   789  			ab := strings.SplitN(rs, "-", 2)
   790  			a, err := strconv.Atoi(ab[0])
   791  			if err != nil {
   792  				return err
   793  			}
   794  			b, err := strconv.Atoi(ab[1])
   795  			if err != nil {
   796  				return err
   797  			}
   798  			if b < a {
   799  				return errors.New("invalid range")
   800  			}
   801  			thisRange = vrange{a, b}
   802  		} else {
   803  			a, err := strconv.Atoi(rs)
   804  			if err != nil {
   805  				return err
   806  			}
   807  			thisRange = vrange{a, a}
   808  		}
   809  		r.ranges = append(r.ranges, thisRange)
   810  	}
   811  	return nil
   812  }
   813  
   814  // items returns the values selected by the range selection
   815  func (r *rangeSelection) items() map[int]struct{} {
   816  	s := map[int]struct{}{}
   817  	for _, vr := range r.ranges {
   818  		for i := vr.a; i <= vr.b; i++ {
   819  			s[i] = struct{}{}
   820  		}
   821  	}
   822  	return s
   823  }