github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/cli/start.go

github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/cli/start.go (about)

     1  // Copyright 2015 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package cli
    12  
    13  import (
    14  	"bytes"
    15  	"context"
    16  	"flag"
    17  	"fmt"
    18  	"io/ioutil"
    19  	"net"
    20  	"net/url"
    21  	"os"
    22  	"os/signal"
    23  	"path/filepath"
    24  	"runtime"
    25  	"runtime/pprof"
    26  	"strings"
    27  	"text/tabwriter"
    28  	"time"
    29  
    30  	"github.com/cockroachdb/cockroach/pkg/base"
    31  	"github.com/cockroachdb/cockroach/pkg/build"
    32  	"github.com/cockroachdb/cockroach/pkg/cli/cliflags"
    33  	"github.com/cockroachdb/cockroach/pkg/geo/geos"
    34  	"github.com/cockroachdb/cockroach/pkg/rpc"
    35  	"github.com/cockroachdb/cockroach/pkg/security"
    36  	"github.com/cockroachdb/cockroach/pkg/server"
    37  	"github.com/cockroachdb/cockroach/pkg/server/status"
    38  	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
    39  	"github.com/cockroachdb/cockroach/pkg/storage"
    40  	"github.com/cockroachdb/cockroach/pkg/storage/enginepb"
    41  	"github.com/cockroachdb/cockroach/pkg/util/envutil"
    42  	"github.com/cockroachdb/cockroach/pkg/util/errorutil/unimplemented"
    43  	"github.com/cockroachdb/cockroach/pkg/util/grpcutil"
    44  	"github.com/cockroachdb/cockroach/pkg/util/hlc"
    45  	"github.com/cockroachdb/cockroach/pkg/util/humanizeutil"
    46  	"github.com/cockroachdb/cockroach/pkg/util/log"
    47  	"github.com/cockroachdb/cockroach/pkg/util/log/logflags"
    48  	"github.com/cockroachdb/cockroach/pkg/util/sdnotify"
    49  	"github.com/cockroachdb/cockroach/pkg/util/stop"
    50  	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
    51  	"github.com/cockroachdb/cockroach/pkg/util/sysutil"
    52  	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
    53  	"github.com/cockroachdb/cockroach/pkg/util/tracing"
    54  	"github.com/cockroachdb/errors"
    55  	"github.com/cockroachdb/pebble"
    56  	opentracing "github.com/opentracing/opentracing-go"
    57  	"github.com/spf13/cobra"
    58  	"google.golang.org/grpc"
    59  )
    60  
// jemallocHeapDump is an optional function to be called at heap dump time.
// This will be non-nil when jemalloc is linked in with profiling enabled.
// The function takes a filename to write the profile to.
//
// initMemProfile checks this variable for nil before every use: when it is
// nil only Go heap profiles are written.
var jemallocHeapDump func(string) error
    65  
// startCmd starts a node by initializing the stores and joining
// the cluster.
//
// The command accepts no positional arguments. Its handler is runStartJoin,
// wrapped in MaybeDecorateGRPCError and maybeShoutError.
var startCmd = &cobra.Command{
	Use:   "start",
	Short: "start a node in a multi-node cluster",
	Long: `
Start a CockroachDB node, which will export data from one or more
storage devices, specified via --store flags.

Specify the --join flag to point to another node or nodes that are
part of the same cluster. The other nodes do not need to be started
yet, and if the address of the other nodes to be added are not yet
known it is legal for the first node to join itself.

If --join is not specified, the cluster will also be initialized.
THIS BEHAVIOR IS DEPRECATED; consider using 'cockroach init' or
'cockroach start-single-node' instead.
`,
	Example: `  cockroach start --insecure --store=attrs=ssd,path=/mnt/ssd1 --join=host:port,[host:port]`,
	// Positional arguments are rejected; all configuration comes from flags.
	Args: cobra.NoArgs,
	RunE: maybeShoutError(MaybeDecorateGRPCError(runStartJoin)),
}
    88  
// startSingleNodeCmd starts a node by initializing the stores.
//
// The command accepts no positional arguments. Its handler is
// runStartSingleNode (which refuses an explicit --join flag), wrapped in
// MaybeDecorateGRPCError and maybeShoutError.
var startSingleNodeCmd = &cobra.Command{
	Use:   "start-single-node",
	Short: "start a single-node cluster",
	Long: `
Start a CockroachDB node, which will export data from one or more
storage devices, specified via --store flags.
The cluster will also be automatically initialized with
replication disabled (replication factor = 1).
`,
	Example: `  cockroach start-single-node --insecure --store=attrs=ssd,path=/mnt/ssd1`,
	// Positional arguments are rejected; all configuration comes from flags.
	Args: cobra.NoArgs,
	RunE: maybeShoutError(MaybeDecorateGRPCError(runStartSingleNode)),
}
   103  
// StartCmds exports startCmd and startSingleNodeCmd so that other
// packages can add flags to them.
var StartCmds = []*cobra.Command{startCmd, startSingleNodeCmd}
   107  
// maxSizePerProfile is the maximum total size in bytes for profiles per
// profile type. It is read from the COCKROACH_MAX_SIZE_PER_PROFILE
// environment variable and defaults to 100 MB. It is the budget handed to
// gcProfiles for the jemalloc, Go heap and CPU profile files.
var maxSizePerProfile = envutil.EnvOrDefaultInt64(
	"COCKROACH_MAX_SIZE_PER_PROFILE", 100<<20 /* 100 MB */)
   112  
   113  // gcProfiles removes old profiles matching the specified prefix when the sum
   114  // of newer profiles is larger than maxSize. Requires that the suffix used for
   115  // the profiles indicates age (e.g. by using a date/timestamp suffix) such that
   116  // sorting the filenames corresponds to ordering the profiles from oldest to
   117  // newest.
   118  func gcProfiles(dir, prefix string, maxSize int64) {
   119  	files, err := ioutil.ReadDir(dir)
   120  	if err != nil {
   121  		log.Warningf(context.Background(), "%v", err)
   122  		return
   123  	}
   124  	var sum int64
   125  	var found int
   126  	for i := len(files) - 1; i >= 0; i-- {
   127  		f := files[i]
   128  		if !f.Mode().IsRegular() {
   129  			continue
   130  		}
   131  		if !strings.HasPrefix(f.Name(), prefix) {
   132  			continue
   133  		}
   134  		found++
   135  		sum += f.Size()
   136  		if found == 1 {
   137  			// Always keep the most recent profile.
   138  			continue
   139  		}
   140  		if sum <= maxSize {
   141  			continue
   142  		}
   143  		if err := os.Remove(filepath.Join(dir, f.Name())); err != nil {
   144  			log.Infof(context.Background(), "%v", err)
   145  		}
   146  	}
   147  }
   148  
   149  func initMemProfile(ctx context.Context, dir string) {
   150  	const jeprof = "jeprof."
   151  	const memprof = "memprof."
   152  
   153  	gcProfiles(dir, jeprof, maxSizePerProfile)
   154  	gcProfiles(dir, memprof, maxSizePerProfile)
   155  
   156  	memProfileInterval := envutil.EnvOrDefaultDuration("COCKROACH_MEMPROF_INTERVAL", -1)
   157  	if memProfileInterval <= 0 {
   158  		return
   159  	}
   160  	if min := time.Second; memProfileInterval < min {
   161  		log.Infof(ctx, "fixing excessively short memory profiling interval: %s -> %s",
   162  			memProfileInterval, min)
   163  		memProfileInterval = min
   164  	}
   165  
   166  	if jemallocHeapDump != nil {
   167  		log.Infof(ctx, "writing go and jemalloc memory profiles to %s every %s", dir, memProfileInterval)
   168  	} else {
   169  		log.Infof(ctx, "writing go only memory profiles to %s every %s", dir, memProfileInterval)
   170  		log.Infof(ctx, `to enable jmalloc profiling: "export MALLOC_CONF=prof:true" or "ln -s prof:true /etc/malloc.conf"`)
   171  	}
   172  
   173  	go func() {
   174  		ctx := context.Background()
   175  		t := time.NewTicker(memProfileInterval)
   176  		defer t.Stop()
   177  
   178  		for {
   179  			<-t.C
   180  
   181  			func() {
   182  				const format = "2006-01-02T15_04_05.999"
   183  				suffix := timeutil.Now().Format(format)
   184  
   185  				// Try jemalloc heap profile first, we only log errors.
   186  				if jemallocHeapDump != nil {
   187  					jepath := filepath.Join(dir, jeprof+suffix)
   188  					if err := jemallocHeapDump(jepath); err != nil {
   189  						log.Warningf(ctx, "error writing jemalloc heap %s: %s", jepath, err)
   190  					}
   191  					gcProfiles(dir, jeprof, maxSizePerProfile)
   192  				}
   193  
   194  				path := filepath.Join(dir, memprof+suffix)
   195  				// Try writing a go heap profile.
   196  				f, err := os.Create(path)
   197  				if err != nil {
   198  					log.Warningf(ctx, "error creating go heap file %s", err)
   199  					return
   200  				}
   201  				defer f.Close()
   202  				if err = pprof.WriteHeapProfile(f); err != nil {
   203  					log.Warningf(ctx, "error writing go heap %s: %s", path, err)
   204  					return
   205  				}
   206  				gcProfiles(dir, memprof, maxSizePerProfile)
   207  			}()
   208  		}
   209  	}()
   210  }
   211  
   212  func initCPUProfile(ctx context.Context, dir string) {
   213  	const cpuprof = "cpuprof."
   214  	gcProfiles(dir, cpuprof, maxSizePerProfile)
   215  
   216  	cpuProfileInterval := envutil.EnvOrDefaultDuration("COCKROACH_CPUPROF_INTERVAL", -1)
   217  	if cpuProfileInterval <= 0 {
   218  		return
   219  	}
   220  	if min := time.Second; cpuProfileInterval < min {
   221  		log.Infof(ctx, "fixing excessively short cpu profiling interval: %s -> %s",
   222  			cpuProfileInterval, min)
   223  		cpuProfileInterval = min
   224  	}
   225  
   226  	go func() {
   227  		defer log.RecoverAndReportPanic(ctx, &serverCfg.Settings.SV)
   228  
   229  		ctx := context.Background()
   230  
   231  		t := time.NewTicker(cpuProfileInterval)
   232  		defer t.Stop()
   233  
   234  		var currentProfile *os.File
   235  		defer func() {
   236  			if currentProfile != nil {
   237  				pprof.StopCPUProfile()
   238  				currentProfile.Close()
   239  			}
   240  		}()
   241  
   242  		for {
   243  			func() {
   244  				const format = "2006-01-02T15_04_05.999"
   245  				suffix := timeutil.Now().Add(cpuProfileInterval).Format(format)
   246  				f, err := os.Create(filepath.Join(dir, cpuprof+suffix))
   247  				if err != nil {
   248  					log.Warningf(ctx, "error creating go cpu file %s", err)
   249  					return
   250  				}
   251  
   252  				// Stop the current profile if it exists.
   253  				if currentProfile != nil {
   254  					pprof.StopCPUProfile()
   255  					currentProfile.Close()
   256  					currentProfile = nil
   257  					gcProfiles(dir, cpuprof, maxSizePerProfile)
   258  				}
   259  
   260  				// Start the new profile.
   261  				if err := pprof.StartCPUProfile(f); err != nil {
   262  					log.Warningf(ctx, "unable to start cpu profile: %v", err)
   263  					f.Close()
   264  					return
   265  				}
   266  				currentProfile = f
   267  			}()
   268  
   269  			<-t.C
   270  		}
   271  	}()
   272  }
   273  
   274  func initBlockProfile() {
   275  	// Enable the block profile for a sample of mutex and channel operations.
   276  	// Smaller values provide more accurate profiles but are more
   277  	// expensive. 0 and 1 are special: 0 disables the block profile and
   278  	// 1 captures 100% of block events. For other values, the profiler
   279  	// will sample one event per X nanoseconds spent blocking.
   280  	//
   281  	// The block profile can be viewed with `pprof http://HOST:PORT/debug/pprof/block`
   282  	//
   283  	// The utility of the block profile (aka blocking profile) has diminished
   284  	// with the advent of the mutex profile. We currently leave the block profile
   285  	// disabled by default as it has a non-zero performance impact.
   286  	d := envutil.EnvOrDefaultInt64("COCKROACH_BLOCK_PROFILE_RATE", 0)
   287  	runtime.SetBlockProfileRate(int(d))
   288  }
   289  
   290  func initMutexProfile() {
   291  	// Enable the mutex profile for a fraction of mutex contention events.
   292  	// Smaller values provide more accurate profiles but are more expensive. 0
   293  	// and 1 are special: 0 disables the mutex profile and 1 captures 100% of
   294  	// mutex contention events. For other values, the profiler will sample on
   295  	// average 1/X events.
   296  	//
   297  	// The mutex profile can be viewed with `pprof http://HOST:PORT/debug/pprof/mutex`
   298  	d := envutil.EnvOrDefaultInt("COCKROACH_MUTEX_PROFILE_RATE",
   299  		1000 /* 1 sample per 1000 mutex contention events */)
   300  	runtime.SetMutexProfileFraction(d)
   301  }
   302  
// cacheSizeValue and sqlSizeValue are bound to serverCfg.CacheSize and
// serverCfg.MemoryPoolSize respectively; both are constructed with
// memoryPercentResolver as their percent resolver.
var cacheSizeValue = newBytesOrPercentageValue(&serverCfg.CacheSize, memoryPercentResolver)
var sqlSizeValue = newBytesOrPercentageValue(&serverCfg.MemoryPoolSize, memoryPercentResolver)

// diskTempStorageSizeValue is constructed without a target and without a
// percent resolver: initTempStorageConfig supplies both when it calls
// Resolve, because the resolver to use depends on the selected store.
var diskTempStorageSizeValue = newBytesOrPercentageValue(nil /* v */, nil /* percentResolver */)
   306  
   307  func initExternalIODir(ctx context.Context, firstStore base.StoreSpec) (string, error) {
   308  	externalIODir := startCtx.externalIODir
   309  	if externalIODir == "" && !firstStore.InMemory {
   310  		externalIODir = filepath.Join(firstStore.Path, "extern")
   311  	}
   312  	if externalIODir == "" || externalIODir == "disabled" {
   313  		return "", nil
   314  	}
   315  	if !filepath.IsAbs(externalIODir) {
   316  		return "", errors.Errorf("%s path must be absolute", cliflags.ExternalIODir.Name)
   317  	}
   318  	return externalIODir, nil
   319  }
   320  
   321  func initTempStorageConfig(
   322  	ctx context.Context, st *cluster.Settings, stopper *stop.Stopper, useStore base.StoreSpec,
   323  ) (base.TempStorageConfig, error) {
   324  	var recordPath string
   325  	if !useStore.InMemory {
   326  		recordPath = filepath.Join(useStore.Path, server.TempDirsRecordFilename)
   327  	}
   328  
   329  	var err error
   330  	// Need to first clean up any abandoned temporary directories from
   331  	// the temporary directory record file before creating any new
   332  	// temporary directories in case the disk is completely full.
   333  	if recordPath != "" {
   334  		if err = storage.CleanupTempDirs(recordPath); err != nil {
   335  			return base.TempStorageConfig{}, errors.Wrap(err, "could not cleanup temporary directories from record file")
   336  		}
   337  	}
   338  
   339  	// The temp store size can depend on the location of the first regular store
   340  	// (if it's expressed as a percentage), so we resolve that flag here.
   341  	var tempStorePercentageResolver percentResolverFunc
   342  	if !useStore.InMemory {
   343  		dir := useStore.Path
   344  		// Create the store dir, if it doesn't exist. The dir is required to exist
   345  		// by diskPercentResolverFactory.
   346  		if err = os.MkdirAll(dir, 0755); err != nil {
   347  			return base.TempStorageConfig{}, errors.Wrapf(err, "failed to create dir for first store: %s", dir)
   348  		}
   349  		tempStorePercentageResolver, err = diskPercentResolverFactory(dir)
   350  		if err != nil {
   351  			return base.TempStorageConfig{}, errors.Wrapf(err, "failed to create resolver for: %s", dir)
   352  		}
   353  	} else {
   354  		tempStorePercentageResolver = memoryPercentResolver
   355  	}
   356  	var tempStorageMaxSizeBytes int64
   357  	if err = diskTempStorageSizeValue.Resolve(
   358  		&tempStorageMaxSizeBytes, tempStorePercentageResolver,
   359  	); err != nil {
   360  		return base.TempStorageConfig{}, err
   361  	}
   362  	if !diskTempStorageSizeValue.IsSet() {
   363  		// The default temp storage size is different when the temp
   364  		// storage is in memory (which occurs when no temp directory
   365  		// is specified and the first store is in memory).
   366  		if startCtx.tempDir == "" && useStore.InMemory {
   367  			tempStorageMaxSizeBytes = base.DefaultInMemTempStorageMaxSizeBytes
   368  		} else {
   369  			tempStorageMaxSizeBytes = base.DefaultTempStorageMaxSizeBytes
   370  		}
   371  	}
   372  
   373  	// Initialize a base.TempStorageConfig based on first store's spec and
   374  	// cli flags.
   375  	tempStorageConfig := base.TempStorageConfigFromEnv(
   376  		ctx,
   377  		st,
   378  		useStore,
   379  		startCtx.tempDir,
   380  		tempStorageMaxSizeBytes,
   381  	)
   382  
   383  	// Set temp directory to first store's path if the temp storage is not
   384  	// in memory.
   385  	tempDir := startCtx.tempDir
   386  	if tempDir == "" && !tempStorageConfig.InMemory {
   387  		tempDir = useStore.Path
   388  	}
   389  	// Create the temporary subdirectory for the temp engine.
   390  	if tempStorageConfig.Path, err = storage.CreateTempDir(tempDir, server.TempDirPrefix, stopper); err != nil {
   391  		return base.TempStorageConfig{}, errors.Wrap(err, "could not create temporary directory for temp storage")
   392  	}
   393  
   394  	// We record the new temporary directory in the record file (if it
   395  	// exists) for cleanup in case the node crashes.
   396  	if recordPath != "" {
   397  		if err = storage.RecordTempDir(recordPath, tempStorageConfig.Path); err != nil {
   398  			return base.TempStorageConfig{}, errors.Wrapf(
   399  				err,
   400  				"could not record temporary directory path to record file: %s",
   401  				recordPath,
   402  			)
   403  		}
   404  	}
   405  
   406  	return tempStorageConfig, nil
   407  }
   408  
   409  // Checks if the passed-in engine type is default, and if so, resolves it to
   410  // the storage engine last used to write to the store at dir (or rocksdb if
   411  // a store wasn't found).
   412  func resolveStorageEngineType(
   413  	ctx context.Context, engineType enginepb.EngineType, cfg base.StorageConfig,
   414  ) enginepb.EngineType {
   415  	if engineType == enginepb.EngineTypeDefault {
   416  		engineType = enginepb.EngineTypePebble
   417  		pebbleCfg := &storage.PebbleConfig{
   418  			StorageConfig: cfg,
   419  			Opts:          storage.DefaultPebbleOptions(),
   420  		}
   421  		pebbleCfg.Opts.EnsureDefaults()
   422  		pebbleCfg.Opts.ReadOnly = true
   423  		// Resolve encrypted env options in pebbleCfg and populate pebbleCfg.Opts.FS
   424  		// if necessary (eg. encrypted-at-rest is enabled).
   425  		_, _, err := storage.ResolveEncryptedEnvOptions(pebbleCfg)
   426  		if err != nil {
   427  			log.Infof(ctx, "unable to setup encrypted env to resolve past engine type: %s", err)
   428  			return engineType
   429  		}
   430  
   431  		// Check if this storage directory was last written to by rocksdb. In that
   432  		// case, default to opening a RocksDB engine.
   433  		if version, err := pebble.GetVersion(cfg.Dir, pebbleCfg.Opts.FS); err == nil {
   434  			if strings.HasPrefix(version, "rocksdb") {
   435  				engineType = enginepb.EngineTypeRocksDB
   436  			}
   437  		}
   438  	}
   439  	return engineType
   440  }
   441  
// errCannotUseJoin is returned by runStartSingleNode when the --join flag
// was explicitly set on the command line.
var errCannotUseJoin = errors.New("cannot use --join with 'cockroach start-single-node' -- use 'cockroach start' instead")
   443  
   444  func runStartSingleNode(cmd *cobra.Command, args []string) error {
   445  	joinFlag := flagSetForCmd(cmd).Lookup(cliflags.Join.Name)
   446  	if joinFlag.Changed {
   447  		return errCannotUseJoin
   448  	}
   449  	// Now actually set the flag as changed so that the start code
   450  	// doesn't warn that it was not set.
   451  	joinFlag.Changed = true
   452  	return runStart(cmd, args, true /*disableReplication*/)
   453  }
   454  
   455  func runStartJoin(cmd *cobra.Command, args []string) error {
   456  	return runStart(cmd, args, false /*disableReplication*/)
   457  }
   458  
   459  // runStart starts the cockroach node using --store as the list of
   460  // storage devices ("stores") on this machine and --join as the list
   461  // of other active nodes used to join this node to the cockroach
   462  // cluster, if this is its first time connecting.
   463  //
   464  // If the argument disableReplication is true and we are starting
   465  // a fresh cluster, the replication factor will be disabled in
   466  // all zone configs.
   467  func runStart(cmd *cobra.Command, args []string, disableReplication bool) error {
   468  	tBegin := timeutil.Now()
   469  
   470  	// First things first: if the user wants background processing,
   471  	// relinquish the terminal ASAP by forking and exiting.
   472  	//
   473  	// If executing in the background, the function returns ok == true in
   474  	// the parent process (regardless of err) and the parent exits at
   475  	// this point.
   476  	if ok, err := maybeRerunBackground(); ok {
   477  		return err
   478  	}
   479  
   480  	// Change the permission mask for all created files.
   481  	//
   482  	// We're considering everything produced by a cockroach node
   483  	// to potentially contain sensitive information, so it should
   484  	// not be world-readable.
   485  	disableOtherPermissionBits()
   486  
   487  	// Set up the signal handlers. This also ensures that any of these
   488  	// signals received beyond this point do not interrupt the startup
   489  	// sequence until the point signals are checked below.
   490  	// We want to set up signal handling before starting logging, because
   491  	// logging uses buffering, and we want to be able to sync
   492  	// the buffers in the signal handler below. If we started capturing
   493  	// signals later, some startup logging might be lost.
   494  	signalCh := make(chan os.Signal, 1)
   495  	signal.Notify(signalCh, drainSignals...)
   496  
   497  	// Set up a cancellable context for the entire start command.
   498  	// The context will be canceled at the end.
   499  	ctx, cancel := context.WithCancel(context.Background())
   500  	defer cancel()
   501  
   502  	// Set up a tracing span for the start process.  We want any logging
   503  	// happening beyond this point to be accounted to this start
   504  	// context, including logging related to the initialization of
   505  	// the logging infrastructure below.
   506  	// This span concludes when the startup goroutine started below
   507  	// has completed.
   508  	// TODO(andrei): we don't close the span on the early returns below.
   509  	tracer := serverCfg.Settings.Tracer
   510  	sp := tracer.StartRootSpan("server start", nil /* logTags */, tracing.NonRecordableSpan)
   511  	ctx = opentracing.ContextWithSpan(ctx, sp)
   512  
   513  	// Set up the logging and profiling output.
   514  	//
   515  	// We want to do this as early as possible, because most of the code
   516  	// in CockroachDB may use logging, and until logging has been
   517  	// initialized log files will be created in $TMPDIR instead of their
   518  	// expected location.
   519  	//
   520  	// This initialization uses the various configuration parameters
   521  	// initialized by flag handling (before runStart was called). Any
   522  	// additional server configuration tweaks for the startup process
   523  	// must be necessarily non-logging-related, as logging parameters
   524  	// cannot be picked up beyond this point.
   525  	stopper, err := setupAndInitializeLoggingAndProfiling(ctx, cmd)
   526  	if err != nil {
   527  		return err
   528  	}
   529  
   530  	// If any store has something to say against a server start-up
   531  	// (e.g. previously detected corruption), listen to them now.
   532  	if err := serverCfg.Stores.PriorCriticalAlertError(); err != nil {
   533  		return err
   534  	}
   535  
   536  	// We don't care about GRPCs fairly verbose logs in most client commands,
   537  	// but when actually starting a server, we enable them.
   538  	grpcutil.SetSeverity(log.Severity_WARNING)
   539  
   540  	// Check the --join flag.
   541  	if !flagSetForCmd(cmd).Lookup(cliflags.Join.Name).Changed {
   542  		log.Shout(ctx, log.Severity_WARNING,
   543  			"running 'cockroach start' without --join is deprecated.\n"+
   544  				"Consider using 'cockroach start-single-node' or 'cockroach init' instead.")
   545  	}
   546  
   547  	// Now perform additional configuration tweaks specific to the start
   548  	// command.
   549  
   550  	// Derive temporary/auxiliary directory specifications.
   551  	if serverCfg.Settings.ExternalIODir, err = initExternalIODir(ctx, serverCfg.Stores.Specs[0]); err != nil {
   552  		return err
   553  	}
   554  
   555  	// Build a minimal StorageConfig out of the first store's spec, with enough
   556  	// attributes to be able to read encrypted-at-rest store directories.
   557  	firstSpec := serverCfg.Stores.Specs[0]
   558  	firstStoreConfig := base.StorageConfig{
   559  		Attrs:           firstSpec.Attributes,
   560  		Dir:             firstSpec.Path,
   561  		Settings:        serverCfg.Settings,
   562  		UseFileRegistry: firstSpec.UseFileRegistry,
   563  		ExtraOptions:    firstSpec.ExtraOptions,
   564  	}
   565  	// If the storage engine is set to "default", check the engine type used in
   566  	// this store directory in a past run. If this check fails for any reason,
   567  	// use Pebble as the default engine type.
   568  	serverCfg.StorageEngine = resolveStorageEngineType(ctx, serverCfg.StorageEngine, firstStoreConfig)
   569  
   570  	// Next we initialize the target directory for temporary storage.
   571  	// If encryption at rest is enabled in any fashion, we'll want temp
   572  	// storage to be encrypted too. To achieve this, we use
   573  	// the first encrypted store as temp dir target, if any.
   574  	// If we can't find one, we use the first StoreSpec in the list.
   575  	var specIdx = 0
   576  	for i := range serverCfg.Stores.Specs {
   577  		if serverCfg.Stores.Specs[i].ExtraOptions != nil {
   578  			specIdx = i
   579  		}
   580  	}
   581  
   582  	if serverCfg.TempStorageConfig, err = initTempStorageConfig(
   583  		ctx, serverCfg.Settings, stopper, serverCfg.Stores.Specs[specIdx],
   584  	); err != nil {
   585  		return err
   586  	}
   587  
   588  	// Initialize the node's configuration from startup parameters.
   589  	// This also reads the part of the configuration that comes from
   590  	// environment variables.
   591  	if err := serverCfg.InitNode(ctx); err != nil {
   592  		return errors.Wrap(err, "failed to initialize node")
   593  	}
   594  
   595  	// The configuration is now ready to report to the user and the log
   596  	// file. We had to wait after InitNode() so that all configuration
   597  	// environment variables, which are reported too, have been read and
   598  	// registered.
   599  	reportConfiguration(ctx)
   600  
   601  	// Until/unless CockroachDB embeds its own tz database, we want
   602  	// an early sanity check. It's better to inform the user early
   603  	// than to get surprising errors during SQL queries.
   604  	if err := checkTzDatabaseAvailability(ctx); err != nil {
   605  		return errors.Wrap(err, "failed to initialize node")
   606  	}
   607  
   608  	// ReadyFn will be called when the server has started listening on
   609  	// its network sockets, but perhaps before it has done bootstrapping
   610  	// and thus before Start() completes.
   611  	serverCfg.ReadyFn = func(waitForInit bool) {
   612  		// Inform the user if the network settings are suspicious. We need
   613  		// to do that after starting to listen because we need to know
   614  		// which advertise address NewServer() has decided.
   615  		hintServerCmdFlags(ctx, cmd)
   616  
   617  		// If another process was waiting on the PID (e.g. using a FIFO),
   618  		// this is when we can tell them the node has started listening.
   619  		if startCtx.pidFile != "" {
   620  			log.Infof(ctx, "PID file: %s", startCtx.pidFile)
   621  			if err := ioutil.WriteFile(startCtx.pidFile, []byte(fmt.Sprintf("%d\n", os.Getpid())), 0644); err != nil {
   622  				log.Errorf(ctx, "failed writing the PID: %v", err)
   623  			}
   624  		}
   625  
   626  		// If the invoker has requested an URL update, do it now that
   627  		// the server is ready to accept SQL connections.
   628  		// (Note: as stated above, ReadyFn is called after the server
   629  		// has started listening on its socket, but possibly before
   630  		// the cluster has been initialized and can start processing requests.
   631  		// This is OK for SQL clients, as the connection will be accepted
   632  		// by the network listener and will just wait/suspend until
   633  		// the cluster initializes, at which point it will be picked up
   634  		// and let the client go through, transparently.)
   635  		if startCtx.listeningURLFile != "" {
   636  			log.Infof(ctx, "listening URL file: %s", startCtx.listeningURLFile)
   637  			// (Re-)compute the client connection URL. We cannot do this
   638  			// earlier (e.g. above, in the runStart function) because
   639  			// at this time the address and port have not been resolved yet.
   640  			pgURL, err := serverCfg.PGURL(url.User(security.RootUser))
   641  			if err != nil {
   642  				log.Errorf(ctx, "failed computing the URL: %v", err)
   643  				return
   644  			}
   645  
   646  			if err = ioutil.WriteFile(startCtx.listeningURLFile, []byte(fmt.Sprintf("%s\n", pgURL)), 0644); err != nil {
   647  				log.Errorf(ctx, "failed writing the URL: %v", err)
   648  			}
   649  		}
   650  
   651  		if waitForInit {
   652  			log.Shout(ctx, log.Severity_INFO,
   653  				"initial startup completed.\n"+
   654  					"Node will now attempt to join a running cluster, or wait for `cockroach init`.\n"+
   655  					"Client connections will be accepted after this completes successfully.\n"+
   656  					"Check the log file(s) for progress. ")
   657  		}
   658  
   659  		// Ensure the configuration logging is written to disk in case a
   660  		// process is waiting for the sdnotify readiness to read important
   661  		// information from there.
   662  		log.Flush()
   663  
   664  		// Signal readiness. This unblocks the process when running with
   665  		// --background or under systemd.
   666  		if err := sdnotify.Ready(); err != nil {
   667  			log.Errorf(ctx, "failed to signal readiness using systemd protocol: %s", err)
   668  		}
   669  	}
   670  
   671  	// DelayedBoostrapFn will be called if the boostrap process is
   672  	// taking a bit long.
   673  	serverCfg.DelayedBootstrapFn = func() {
   674  		const msg = `The server appears to be unable to contact the other nodes in the cluster. Please try:
   675  
   676  - starting the other nodes, if you haven't already;
   677  - double-checking that the '--join' and '--listen'/'--advertise' flags are set up correctly;
   678  - running the 'cockroach init' command if you are trying to initialize a new cluster.
   679  
   680  If problems persist, please see %s.`
   681  		docLink := base.DocsURL("cluster-setup-troubleshooting.html")
   682  		if !startCtx.inBackground {
   683  			log.Shoutf(context.Background(), log.Severity_WARNING, msg, docLink)
   684  		} else {
   685  			// Don't shout to stderr since the server will have detached by
   686  			// the time this function gets called.
   687  			log.Warningf(ctx, msg, docLink)
   688  		}
   689  	}
   690  
   691  	// Set up the Geospatial library.
   692  	// We need to make sure this happens before any queries involving geospatial data is executed.
   693  	loc, err := geos.EnsureInit(geos.EnsureInitErrorDisplayPrivate, demoCtx.geoLibsDir)
   694  	if err != nil {
   695  		log.Infof(ctx, "could not initialize GEOS - geospatial functions may not be available: %v", err)
   696  	} else {
   697  		log.Infof(ctx, "GEOS initialized at %s", loc)
   698  	}
   699  
   700  	// Beyond this point, the configuration is set and the server is
   701  	// ready to start.
   702  	log.Info(ctx, "starting cockroach node")
   703  
   704  	// Run the rest of the startup process in a goroutine separate from
   705  	// the main goroutine to avoid preventing proper handling of signals
   706  	// if we get stuck on something during initialization (#10138).
   707  	var serverStatusMu struct {
   708  		syncutil.Mutex
   709  		// Used to synchronize server startup with server shutdown if something
   710  		// interrupts the process during initialization (it isn't safe to try to
   711  		// drain a server that doesn't exist or is in the middle of starting up,
   712  		// or to start a server after draining has begun).
   713  		started, draining bool
   714  	}
   715  	var s *server.Server
   716  	errChan := make(chan error, 1)
   717  	go func() {
   718  		// Ensure that the log files see the startup messages immediately.
   719  		defer log.Flush()
   720  		// If anything goes dramatically wrong, use Go's panic/recover
   721  		// mechanism to intercept the panic and log the panic details to
   722  		// the error reporting server.
   723  		defer func() {
   724  			if s != nil {
   725  				// We only attempt to log the panic details if the server has
   726  				// actually been started successfully. If there's no server,
   727  				// we won't know enough to decide whether reporting is
   728  				// permitted.
   729  				log.RecoverAndReportPanic(ctx, &s.ClusterSettings().SV)
   730  			}
   731  		}()
   732  		// When the start up goroutine completes, so can the start up span
   733  		// defined above.
   734  		defer sp.Finish()
   735  
   736  		// Any error beyond this point should be reported through the
   737  		// errChan defined above. However, in Go the code pattern "if err
   738  		// != nil { return err }" is more common. Expecting contributors
   739  		// to remember to write "if err != nil { errChan <- err }" beyond
   740  		// this point is optimistic. To avoid any error, we capture all
   741  		// the error returns in a closure, and do the errChan reporting,
   742  		// if needed, when that function returns.
   743  		if err := func() error {
   744  			// Instantiate the server.
   745  			var err error
   746  			s, err = server.NewServer(serverCfg, stopper)
   747  			if err != nil {
   748  				return errors.Wrap(err, "failed to start server")
   749  			}
   750  
   751  			// Have we already received a signal to terminate? If so, just
   752  			// stop here.
   753  			serverStatusMu.Lock()
   754  			draining := serverStatusMu.draining
   755  			serverStatusMu.Unlock()
   756  			if draining {
   757  				return nil
   758  			}
   759  
   760  			// Attempt to start the server.
   761  			if err := s.Start(ctx); err != nil {
   762  				if le := (*server.ListenError)(nil); errors.As(err, &le) {
   763  					const errorPrefix = "consider changing the port via --%s"
   764  					if le.Addr == serverCfg.Addr {
   765  						err = errors.Wrapf(err, errorPrefix, cliflags.ListenAddr.Name)
   766  					} else if le.Addr == serverCfg.HTTPAddr {
   767  						err = errors.Wrapf(err, errorPrefix, cliflags.ListenHTTPAddr.Name)
   768  					}
   769  				}
   770  
   771  				return errors.Wrap(err, "cockroach server exited with error")
   772  			}
   773  			// Server started, notify the shutdown monitor running concurrently.
   774  			serverStatusMu.Lock()
   775  			serverStatusMu.started = true
   776  			serverStatusMu.Unlock()
   777  
   778  			// Start up the update check loop.
   779  			// We don't do this in (*server.Server).Start() because we don't want it
   780  			// in tests.
   781  			if !cluster.TelemetryOptOut() {
   782  				s.PeriodicallyCheckForUpdates(ctx)
   783  			}
   784  
   785  			initialBoot := s.InitialBoot()
   786  
   787  			if disableReplication && initialBoot {
   788  				// For start-single-node, set the default replication factor to
   789  				// 1 so as to avoid warning message and unnecessary rebalance
   790  				// churn.
   791  				if err := cliDisableReplication(ctx, s); err != nil {
   792  					log.Errorf(ctx, "could not disable replication: %v", err)
   793  					return err
   794  				}
   795  				log.Shout(ctx, log.Severity_INFO,
   796  					"Replication was disabled for this cluster.\n"+
   797  						"When/if adding nodes in the future, update zone configurations to increase the replication factor.")
   798  			}
   799  
   800  			// Now inform the user that the server is running and tell the
   801  			// user about its run-time derived parameters.
   802  			var buf bytes.Buffer
   803  			info := build.GetInfo()
   804  			tw := tabwriter.NewWriter(&buf, 2, 1, 2, ' ', 0)
   805  			fmt.Fprintf(tw, "CockroachDB node starting at %s (took %0.1fs)\n", timeutil.Now(), timeutil.Since(tBegin).Seconds())
   806  			fmt.Fprintf(tw, "build:\t%s %s @ %s (%s)\n", info.Distribution, info.Tag, info.Time, info.GoVersion)
   807  			fmt.Fprintf(tw, "webui:\t%s\n", serverCfg.AdminURL())
   808  
   809  			// (Re-)compute the client connection URL. We cannot do this
   810  			// earlier (e.g. above, in the runStart function) because
   811  			// at this time the address and port have not been resolved yet.
   812  			pgURL, err := serverCfg.PGURL(url.User(security.RootUser))
   813  			if err != nil {
   814  				log.Errorf(ctx, "failed computing the URL: %v", err)
   815  				return err
   816  			}
   817  			fmt.Fprintf(tw, "sql:\t%s\n", pgURL)
   818  
   819  			fmt.Fprintf(tw, "RPC client flags:\t%s\n", clientFlagsRPC())
   820  			if len(serverCfg.SocketFile) != 0 {
   821  				fmt.Fprintf(tw, "socket:\t%s\n", serverCfg.SocketFile)
   822  			}
   823  			fmt.Fprintf(tw, "logs:\t%s\n", flag.Lookup("log-dir").Value)
   824  			if serverCfg.AuditLogDirName.IsSet() {
   825  				fmt.Fprintf(tw, "SQL audit logs:\t%s\n", serverCfg.AuditLogDirName)
   826  			}
   827  			if serverCfg.Attrs != "" {
   828  				fmt.Fprintf(tw, "attrs:\t%s\n", serverCfg.Attrs)
   829  			}
   830  			if len(serverCfg.Locality.Tiers) > 0 {
   831  				fmt.Fprintf(tw, "locality:\t%s\n", serverCfg.Locality)
   832  			}
   833  			if s.TempDir() != "" {
   834  				fmt.Fprintf(tw, "temp dir:\t%s\n", s.TempDir())
   835  			}
   836  			if ext := s.ClusterSettings().ExternalIODir; ext != "" {
   837  				fmt.Fprintf(tw, "external I/O path: \t%s\n", ext)
   838  			} else {
   839  				fmt.Fprintf(tw, "external I/O path: \t<disabled>\n")
   840  			}
   841  			for i, spec := range serverCfg.Stores.Specs {
   842  				fmt.Fprintf(tw, "store[%d]:\t%s\n", i, spec)
   843  			}
   844  			fmt.Fprintf(tw, "storage engine: \t%s\n", serverCfg.StorageEngine.String())
   845  			nodeID := s.NodeID()
   846  			if initialBoot {
   847  				if nodeID == server.FirstNodeID {
   848  					fmt.Fprintf(tw, "status:\tinitialized new cluster\n")
   849  				} else {
   850  					fmt.Fprintf(tw, "status:\tinitialized new node, joined pre-existing cluster\n")
   851  				}
   852  			} else {
   853  				fmt.Fprintf(tw, "status:\trestarted pre-existing node\n")
   854  			}
   855  
   856  			if baseCfg.ClusterName != "" {
   857  				fmt.Fprintf(tw, "cluster name:\t%s\n", baseCfg.ClusterName)
   858  			}
   859  
   860  			// Remember the cluster ID for log file rotation.
   861  			clusterID := s.ClusterID().String()
   862  			log.SetClusterID(clusterID)
   863  			fmt.Fprintf(tw, "clusterID:\t%s\n", clusterID)
   864  			fmt.Fprintf(tw, "nodeID:\t%d\n", nodeID)
   865  
   866  			// Collect the formatted string and show it to the user.
   867  			if err := tw.Flush(); err != nil {
   868  				return err
   869  			}
   870  			msg := buf.String()
   871  			log.Infof(ctx, "node startup completed:\n%s", msg)
   872  			if !startCtx.inBackground && !log.LoggingToStderr(log.Severity_INFO) {
   873  				fmt.Print(msg)
   874  			}
   875  
   876  			return nil
   877  		}(); err != nil {
   878  			errChan <- err
   879  		}
   880  	}()
   881  
   882  	// The remainder of the main function executes concurrently with the
   883  	// start up goroutine started above.
   884  	//
   885  	// It is concerned with determining when the server should stop
   886  	// because the main process is being shut down -- either via a stop
   887  	// message received from `cockroach quit` / `cockroach
   888  	// decommission`, or a signal.
   889  
   890  	// We'll want to log any shutdown activity against a separate span.
   891  	shutdownSpan := tracer.StartSpan("server shutdown")
   892  	defer shutdownSpan.Finish()
   893  	shutdownCtx := opentracing.ContextWithSpan(context.Background(), shutdownSpan)
   894  
   895  	// returnErr will be populated with the error to use to exit the
   896  	// process (reported to the shell).
   897  	var returnErr error
   898  
   899  	stopWithoutDrain := make(chan struct{}) // closed if interrupted very early
   900  
   901  	// Block until one of the signals above is received or the stopper
   902  	// is stopped externally (for example, via the quit endpoint).
   903  	select {
   904  	case err := <-errChan:
   905  		// SetSync both flushes and ensures that subsequent log writes are flushed too.
   906  		log.SetSync(true)
   907  		return err
   908  
   909  	case <-stopper.ShouldStop():
   910  		// Server is being stopped externally and our job is finished
   911  		// here since we don't know if it's a graceful shutdown or not.
   912  		<-stopper.IsStopped()
   913  		// SetSync both flushes and ensures that subsequent log writes are flushed too.
   914  		log.SetSync(true)
   915  		return nil
   916  
   917  	case sig := <-signalCh:
   918  		// We start synchronizing log writes from here, because if a
   919  		// signal was received there is a non-zero chance the sender of
   920  		// this signal will follow up with SIGKILL if the shutdown is not
   921  		// timely, and we don't want logs to be lost.
   922  		log.SetSync(true)
   923  
   924  		log.Infof(shutdownCtx, "received signal '%s'", sig)
   925  		switch sig {
   926  		case os.Interrupt:
   927  			// Graceful shutdown after an interrupt should cause the process
   928  			// to terminate with a non-zero exit code; however SIGTERM is
   929  			// "legitimate" and should be acknowledged with a success exit
   930  			// code. So we keep the error state here for later.
   931  			returnErr = &cliError{
   932  				exitCode: 1,
   933  				// INFO because a single interrupt is rather innocuous.
   934  				severity: log.Severity_INFO,
   935  				cause:    errors.New("interrupted"),
   936  			}
   937  			msgDouble := "Note: a second interrupt will skip graceful shutdown and terminate forcefully"
   938  			fmt.Fprintln(os.Stdout, msgDouble)
   939  
   940  		case quitSignal:
   941  			log.DumpStacks(shutdownCtx)
   942  		}
   943  
   944  		// Start the draining process in a separate goroutine so that it
   945  		// runs concurrently with the timeout check below.
   946  		go func() {
   947  			serverStatusMu.Lock()
   948  			serverStatusMu.draining = true
   949  			drainingIsSafe := serverStatusMu.started
   950  			serverStatusMu.Unlock()
   951  
   952  			// drainingIsSafe may have been set in the meantime, but that's ok.
   953  			// In the worst case, we're not draining a Server that has *just*
   954  			// started. Not desirable, but not terrible either.
   955  			if !drainingIsSafe {
   956  				close(stopWithoutDrain)
   957  				return
   958  			}
   959  			// Don't use shutdownCtx because this is in a goroutine that may
   960  			// still be running after shutdownCtx's span has been finished.
   961  			ac := log.AmbientContext{}
   962  			ac.AddLogTag("server drain process", nil)
   963  			drainCtx := ac.AnnotateCtx(context.Background())
   964  
   965  			// Perform a graceful drain. We keep retrying forever, in
   966  			// case there are many range leases or some unavailability
   967  			// preventing progress. If the operator wants to expedite
   968  			// the shutdown, they will need to make it ungraceful
   969  			// via a 2nd signal.
   970  			for {
   971  				remaining, _, err := s.Drain(drainCtx)
   972  				if err != nil {
   973  					log.Errorf(drainCtx, "graceful drain failed: %v", err)
   974  					break
   975  				}
   976  				if remaining == 0 {
   977  					// No more work to do.
   978  					break
   979  				}
   980  				// Avoid a busy wait with high CPU usage if the server replies
   981  				// with an incomplete drain too quickly.
   982  				time.Sleep(200 * time.Millisecond)
   983  			}
   984  
   985  			stopper.Stop(drainCtx)
   986  		}()
   987  
   988  	// Don't return: we're shutting down gracefully.
   989  
   990  	case <-log.FatalChan():
   991  		// A fatal error has occurred. Stop everything (gracelessly) to
   992  		// avoid serving incorrect data while the final log messages are
   993  		// being written.
   994  		// https://github.com/cockroachdb/cockroach/issues/23414
   995  		// TODO(bdarnell): This could be more graceless, for example by
   996  		// reaching into the server objects and closing all the
   997  		// connections while they're in use. That would be more in line
   998  		// with the expected effect of a log.Fatal.
   999  		stopper.Stop(shutdownCtx)
  1000  		// The logging goroutine is now responsible for killing this
  1001  		// process, so just block this goroutine.
  1002  		select {}
  1003  	}
  1004  
  1005  	// At this point, a signal has been received to shut down the
  1006  	// process, and a goroutine is busy telling the server to drain and
  1007  	// stop. From this point on, we just have to wait until the server
  1008  	// indicates it has stopped.
  1009  
  1010  	const msgDrain = "initiating graceful shutdown of server"
  1011  	log.Info(shutdownCtx, msgDrain)
  1012  	fmt.Fprintln(os.Stdout, msgDrain)
  1013  
  1014  	// Notify the user every 5 seconds of the shutdown progress.
  1015  	go func() {
  1016  		ticker := time.NewTicker(5 * time.Second)
  1017  		defer ticker.Stop()
  1018  		for {
  1019  			select {
  1020  			case <-ticker.C:
  1021  				log.Infof(context.Background(), "%d running tasks", stopper.NumTasks())
  1022  			case <-stopper.ShouldStop():
  1023  				return
  1024  			case <-stopWithoutDrain:
  1025  				return
  1026  			}
  1027  		}
  1028  	}()
  1029  
  1030  	// Meanwhile, we don't want to wait too long either, in case the
  1031  	// server is getting stuck and doesn't shut down in a timely manner.
  1032  	//
  1033  	// So we also pay attention to any additional signal received beyond
  1034  	// this point (maybe some service monitor was impatient and sends
  1035  	// another signal to hasten the shutdown process).
  1036  	//
  1037  	// If any such trigger to hasten occurs, we simply return, which
  1038  	// will cause the process to exit and the server goroutines to be
  1039  	// forcefully terminated.
  1040  
  1041  	const hardShutdownHint = " - node may take longer to restart & clients may need to wait for leases to expire"
  1042  	select {
  1043  	case sig := <-signalCh:
  1044  		// This new signal is not welcome, as it interferes with the graceful
  1045  		// shutdown process.
  1046  		log.Shoutf(shutdownCtx, log.Severity_ERROR,
  1047  			"received signal '%s' during shutdown, initiating hard shutdown%s",
  1048  			log.Safe(sig), log.Safe(hardShutdownHint))
  1049  		handleSignalDuringShutdown(sig)
  1050  		panic("unreachable")
  1051  
  1052  	case <-stopper.IsStopped():
  1053  		const msgDone = "server drained and shutdown completed"
  1054  		log.Infof(shutdownCtx, msgDone)
  1055  		fmt.Fprintln(os.Stdout, msgDone)
  1056  
  1057  	case <-stopWithoutDrain:
  1058  		const msgDone = "too early to drain; used hard shutdown instead"
  1059  		log.Infof(shutdownCtx, msgDone)
  1060  		fmt.Fprintln(os.Stdout, msgDone)
  1061  	}
  1062  
  1063  	return returnErr
  1064  }
  1065  
  1066  func hintServerCmdFlags(ctx context.Context, cmd *cobra.Command) {
  1067  	pf := flagSetForCmd(cmd)
  1068  
  1069  	listenAddrSpecified := pf.Lookup(cliflags.ListenAddr.Name).Changed || pf.Lookup(cliflags.ServerHost.Name).Changed
  1070  	advAddrSpecified := pf.Lookup(cliflags.AdvertiseAddr.Name).Changed || pf.Lookup(cliflags.AdvertiseHost.Name).Changed
  1071  
  1072  	if !listenAddrSpecified && !advAddrSpecified {
  1073  		host, _, _ := net.SplitHostPort(serverCfg.AdvertiseAddr)
  1074  		log.Shoutf(ctx, log.Severity_WARNING,
  1075  			"neither --listen-addr nor --advertise-addr was specified.\n"+
  1076  				"The server will advertise %q to other nodes, is this routable?\n\n"+
  1077  				"Consider using:\n"+
  1078  				"- for local-only servers:  --listen-addr=localhost\n"+
  1079  				"- for multi-node clusters: --advertise-addr=<host/IP addr>\n", host)
  1080  	}
  1081  }
  1082  
  1083  func clientFlagsRPC() string {
  1084  	flags := []string{os.Args[0], "<client cmd>"}
  1085  	if serverCfg.AdvertiseAddr != "" {
  1086  		flags = append(flags, "--host="+serverCfg.AdvertiseAddr)
  1087  	}
  1088  	if startCtx.serverInsecure {
  1089  		flags = append(flags, "--insecure")
  1090  	} else {
  1091  		flags = append(flags, "--certs-dir="+startCtx.serverSSLCertsDir)
  1092  	}
  1093  	return strings.Join(flags, " ")
  1094  }
  1095  
  1096  func checkTzDatabaseAvailability(ctx context.Context) error {
  1097  	if _, err := timeutil.LoadLocation("America/New_York"); err != nil {
  1098  		log.Errorf(ctx, "timeutil.LoadLocation: %v", err)
  1099  		reportedErr := errors.WithHint(
  1100  			errors.WithIssueLink(
  1101  				errors.New("unable to load named timezones"),
  1102  				errors.IssueLink{IssueURL: unimplemented.MakeURL(36864)}),
  1103  			"Check that the time zone database is installed on your system, or\n"+
  1104  				"set the ZONEINFO environment variable to a Go time zone .zip archive.")
  1105  
  1106  		if envutil.EnvOrDefaultBool("COCKROACH_INCONSISTENT_TIME_ZONES", false) {
  1107  			// The user tells us they really know what they want.
  1108  			reportedErr := &formattedError{err: reportedErr}
  1109  			log.Shoutf(ctx, log.Severity_WARNING, "%v", reportedErr)
  1110  		} else {
  1111  			// Prevent a successful start.
  1112  			//
  1113  			// In the past, we were simply using log.Shout to emit an error,
  1114  			// informing the user that startup could continue with degraded
  1115  			// behavior.  However, usage demonstrated that users typically do
  1116  			// not see the error and instead run into silently incorrect SQL
  1117  			// results. To avoid this situation altogether, it's better to
  1118  			// stop early.
  1119  			return reportedErr
  1120  		}
  1121  	}
  1122  	return nil
  1123  }
  1124  
  1125  func reportConfiguration(ctx context.Context) {
  1126  	serverCfg.Report(ctx)
  1127  	if envVarsUsed := envutil.GetEnvVarsUsed(); len(envVarsUsed) > 0 {
  1128  		log.Infof(ctx, "using local environment variables: %s", strings.Join(envVarsUsed, ", "))
  1129  	}
  1130  	// If a user ever reports "bad things have happened", any
  1131  	// troubleshooting steps will want to rule out that the user was
  1132  	// running as root in a multi-user environment, or using different
  1133  	// uid/gid across runs in the same data directory. To determine
  1134  	// this, it's easier if the information appears in the log file.
  1135  	log.Infof(ctx, "process identity: %s", sysutil.ProcessIdentity())
  1136  }
  1137  
  1138  func maybeWarnMemorySizes(ctx context.Context) {
  1139  	// Is the cache configuration OK?
  1140  	if !cacheSizeValue.IsSet() {
  1141  		var buf bytes.Buffer
  1142  		fmt.Fprintf(&buf, "Using the default setting for --cache (%s).\n", cacheSizeValue)
  1143  		fmt.Fprintf(&buf, "  A significantly larger value is usually needed for good performance.\n")
  1144  		if size, err := status.GetTotalMemory(context.Background()); err == nil {
  1145  			fmt.Fprintf(&buf, "  If you have a dedicated server a reasonable setting is --cache=.25 (%s).",
  1146  				humanizeutil.IBytes(size/4))
  1147  		} else {
  1148  			fmt.Fprintf(&buf, "  If you have a dedicated server a reasonable setting is 25%% of physical memory.")
  1149  		}
  1150  		log.Warningf(ctx, "%s", buf.String())
  1151  	}
  1152  
  1153  	// Check that the total suggested "max" memory is well below the available memory.
  1154  	if maxMemory, err := status.GetTotalMemory(ctx); err == nil {
  1155  		requestedMem := serverCfg.CacheSize + serverCfg.MemoryPoolSize
  1156  		maxRecommendedMem := int64(.75 * float64(maxMemory))
  1157  		if requestedMem > maxRecommendedMem {
  1158  			log.Shoutf(ctx, log.Severity_WARNING,
  1159  				"the sum of --max-sql-memory (%s) and --cache (%s) is larger than 75%% of total RAM (%s).\nThis server is running at increased risk of memory-related failures.",
  1160  				sqlSizeValue, cacheSizeValue, humanizeutil.IBytes(maxRecommendedMem))
  1161  		}
  1162  	}
  1163  }
  1164  
  1165  func logOutputDirectory() string {
  1166  	return startCtx.logDir.String()
  1167  }
  1168  
  1169  // setupAndInitializeLoggingAndProfiling does what it says on the label.
  1170  // Prior to this however it determines suitable defaults for the
  1171  // logging output directory and the verbosity level of stderr logging.
  1172  // We only do this for the "start" command which is why this work
  1173  // occurs here and not in an OnInitialize function.
  1174  func setupAndInitializeLoggingAndProfiling(
  1175  	ctx context.Context, cmd *cobra.Command,
  1176  ) (stopper *stop.Stopper, err error) {
  1177  	// Default the log directory to the "logs" subdirectory of the first
  1178  	// non-memory store. If more than one non-memory stores is detected,
  1179  	// print a warning.
  1180  	ambiguousLogDirs := false
  1181  	lf := cmd.Flags().Lookup(logflags.LogDirName)
  1182  	if !startCtx.logDir.IsSet() && !lf.Changed {
  1183  		// We only override the log directory if the user has not explicitly
  1184  		// disabled file logging using --log-dir="".
  1185  		newDir := ""
  1186  		for _, spec := range serverCfg.Stores.Specs {
  1187  			if spec.InMemory {
  1188  				continue
  1189  			}
  1190  			if newDir != "" {
  1191  				ambiguousLogDirs = true
  1192  				break
  1193  			}
  1194  			newDir = filepath.Join(spec.Path, "logs")
  1195  		}
  1196  		if err := startCtx.logDir.Set(newDir); err != nil {
  1197  			return nil, err
  1198  		}
  1199  	}
  1200  
  1201  	if logDir := startCtx.logDir.String(); logDir != "" {
  1202  		ls := cockroachCmd.PersistentFlags().Lookup(logflags.LogToStderrName)
  1203  		if !ls.Changed {
  1204  			// Unless the settings were overridden by the user, silence
  1205  			// logging to stderr because the messages will go to a log file.
  1206  			if err := ls.Value.Set(log.Severity_NONE.String()); err != nil {
  1207  				return nil, err
  1208  			}
  1209  		}
  1210  
  1211  		// Make sure the path exists.
  1212  		if err := os.MkdirAll(logDir, 0755); err != nil {
  1213  			return nil, errors.Wrap(err, "unable to create log directory")
  1214  		}
  1215  
  1216  		// Note that we configured the --log-dir flag to set
  1217  		// startContext.logDir. This is the point at which we set log-dir for the
  1218  		// util/log package. We don't want to set it earlier to avoid spuriously
  1219  		// creating a file in an incorrect log directory or if something is
  1220  		// accidentally logging after flag parsing but before the --background
  1221  		// dispatch has occurred.
  1222  		if err := flag.Lookup(logflags.LogDirName).Value.Set(logDir); err != nil {
  1223  			return nil, err
  1224  		}
  1225  
  1226  		// NB: this message is a crutch until #33458 is addressed. Without it,
  1227  		// the calls to log.Shout below can be the first use of logging, hitting
  1228  		// the bug described in the issue.
  1229  		log.Infof(ctx, "logging to directory %s", logDir)
  1230  
  1231  		// Start the log file GC daemon to remove files that make the log
  1232  		// directory too large.
  1233  		log.StartGCDaemon(ctx)
  1234  
  1235  		defer func() {
  1236  			if stopper != nil {
  1237  				// When the function complete successfully, start the loggers
  1238  				// for the storage engines. We need to do this at the end
  1239  				// because we need to register the loggers.
  1240  				stopper.AddCloser(storage.InitPebbleLogger(ctx))
  1241  				stopper.AddCloser(storage.InitRocksDBLogger(ctx))
  1242  			}
  1243  		}()
  1244  	}
  1245  
  1246  	// We want to be careful to still produce useful debug dumps if the
  1247  	// server configuration has disabled logging to files.
  1248  	outputDirectory := "."
  1249  	if p := logOutputDirectory(); p != "" {
  1250  		outputDirectory = p
  1251  	}
  1252  	serverCfg.GoroutineDumpDirName = filepath.Join(outputDirectory, base.GoroutineDumpDir)
  1253  	serverCfg.HeapProfileDirName = filepath.Join(outputDirectory, base.HeapProfileDir)
  1254  
  1255  	if ambiguousLogDirs {
  1256  		// Note that we can't report this message earlier, because the log directory
  1257  		// may not have been ready before the call to MkdirAll() above.
  1258  		log.Shout(ctx, log.Severity_WARNING, "multiple stores configured"+
  1259  			" and --log-dir not specified, you may want to specify --log-dir to disambiguate.")
  1260  	}
  1261  
  1262  	if auditLogDir := serverCfg.AuditLogDirName.String(); auditLogDir != "" && auditLogDir != outputDirectory {
  1263  		// Make sure the path for the audit log exists, if it's a different path than
  1264  		// the main log.
  1265  		if err := os.MkdirAll(auditLogDir, 0755); err != nil {
  1266  			return nil, err
  1267  		}
  1268  		log.Eventf(ctx, "created SQL audit log directory %s", auditLogDir)
  1269  	}
  1270  
  1271  	if startCtx.serverInsecure {
  1272  		// Use a non-annotated context here since the annotation just looks funny,
  1273  		// particularly to new users (made worse by it always printing as [n?]).
  1274  		addr := startCtx.serverListenAddr
  1275  		if addr == "" {
  1276  			addr = "<all your IP addresses>"
  1277  		}
  1278  		log.Shoutf(context.Background(), log.Severity_WARNING,
  1279  			"RUNNING IN INSECURE MODE!\n\n"+
  1280  				"- Your cluster is open for any client that can access %s.\n"+
  1281  				"- Any user, even root, can log in without providing a password.\n"+
  1282  				"- Any user, connecting as root, can read or write any data in your cluster.\n"+
  1283  				"- There is no network encryption nor authentication, and thus no confidentiality.\n\n"+
  1284  				"Check out how to secure your cluster: %s",
  1285  			addr, log.Safe(base.DocsURL("secure-a-cluster.html")))
  1286  	}
  1287  
  1288  	maybeWarnMemorySizes(ctx)
  1289  
  1290  	// We log build information to stdout (for the short summary), but also
  1291  	// to stderr to coincide with the full logs.
  1292  	info := build.GetInfo()
  1293  	log.Infof(ctx, "%s", info.Short())
  1294  
  1295  	initMemProfile(ctx, outputDirectory)
  1296  	initCPUProfile(ctx, outputDirectory)
  1297  	initBlockProfile()
  1298  	initMutexProfile()
  1299  
  1300  	// Disable Stopper task tracking as performing that call site tracking is
  1301  	// moderately expensive (certainly outweighing the infrequent benefit it
  1302  	// provides).
  1303  	stopper = stop.NewStopper()
  1304  	log.Event(ctx, "initialized profiles")
  1305  
  1306  	return stopper, nil
  1307  }
  1308  
  1309  func addrWithDefaultHost(addr string) (string, error) {
  1310  	host, port, err := net.SplitHostPort(addr)
  1311  	if err != nil {
  1312  		return "", err
  1313  	}
  1314  	if host == "" {
  1315  		host = "localhost"
  1316  	}
  1317  	return net.JoinHostPort(host, port), nil
  1318  }
  1319  
  1320  // getClientGRPCConn returns a ClientConn, a Clock and a method that blocks
  1321  // until the connection (and its associated goroutines) have terminated.
  1322  func getClientGRPCConn(
  1323  	ctx context.Context, cfg server.Config,
  1324  ) (*grpc.ClientConn, *hlc.Clock, func(), error) {
  1325  	if ctx.Done() == nil {
  1326  		return nil, nil, nil, errors.New("context must be cancellable")
  1327  	}
  1328  	// 0 to disable max offset checks; this RPC context is not a member of the
  1329  	// cluster, so there's no need to enforce that its max offset is the same
  1330  	// as that of nodes in the cluster.
  1331  	clock := hlc.NewClock(hlc.UnixNano, 0)
  1332  	stopper := stop.NewStopper()
  1333  	rpcContext := rpc.NewContext(
  1334  		log.AmbientContext{Tracer: cfg.Settings.Tracer},
  1335  		cfg.Config,
  1336  		clock,
  1337  		stopper,
  1338  		cfg.Settings,
  1339  	)
  1340  	addr, err := addrWithDefaultHost(cfg.AdvertiseAddr)
  1341  	if err != nil {
  1342  		stopper.Stop(ctx)
  1343  		return nil, nil, nil, err
  1344  	}
  1345  	// We use GRPCUnvalidatedDial() here because it does not matter
  1346  	// to which node we're talking to.
  1347  	conn, err := rpcContext.GRPCUnvalidatedDial(addr).Connect(ctx)
  1348  	if err != nil {
  1349  		stopper.Stop(ctx)
  1350  		return nil, nil, nil, err
  1351  	}
  1352  	stopper.AddCloser(stop.CloserFn(func() {
  1353  		_ = conn.Close()
  1354  	}))
  1355  
  1356  	// Tie the lifetime of the stopper to that of the context.
  1357  	closer := func() {
  1358  		stopper.Stop(ctx)
  1359  	}
  1360  	return conn, clock, closer, nil
  1361  }