github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/cli/start.go (about) 1 // Copyright 2015 The Cockroach Authors. 2 // 3 // Use of this software is governed by the Business Source License 4 // included in the file licenses/BSL.txt. 5 // 6 // As of the Change Date specified in that file, in accordance with 7 // the Business Source License, use of this software will be governed 8 // by the Apache License, Version 2.0, included in the file 9 // licenses/APL.txt. 10 11 package cli 12 13 import ( 14 "bytes" 15 "context" 16 "flag" 17 "fmt" 18 "io/ioutil" 19 "net" 20 "net/url" 21 "os" 22 "os/signal" 23 "path/filepath" 24 "runtime" 25 "runtime/pprof" 26 "strings" 27 "text/tabwriter" 28 "time" 29 30 "github.com/cockroachdb/cockroach/pkg/base" 31 "github.com/cockroachdb/cockroach/pkg/build" 32 "github.com/cockroachdb/cockroach/pkg/cli/cliflags" 33 "github.com/cockroachdb/cockroach/pkg/geo/geos" 34 "github.com/cockroachdb/cockroach/pkg/rpc" 35 "github.com/cockroachdb/cockroach/pkg/security" 36 "github.com/cockroachdb/cockroach/pkg/server" 37 "github.com/cockroachdb/cockroach/pkg/server/status" 38 "github.com/cockroachdb/cockroach/pkg/settings/cluster" 39 "github.com/cockroachdb/cockroach/pkg/storage" 40 "github.com/cockroachdb/cockroach/pkg/storage/enginepb" 41 "github.com/cockroachdb/cockroach/pkg/util/envutil" 42 "github.com/cockroachdb/cockroach/pkg/util/errorutil/unimplemented" 43 "github.com/cockroachdb/cockroach/pkg/util/grpcutil" 44 "github.com/cockroachdb/cockroach/pkg/util/hlc" 45 "github.com/cockroachdb/cockroach/pkg/util/humanizeutil" 46 "github.com/cockroachdb/cockroach/pkg/util/log" 47 "github.com/cockroachdb/cockroach/pkg/util/log/logflags" 48 "github.com/cockroachdb/cockroach/pkg/util/sdnotify" 49 "github.com/cockroachdb/cockroach/pkg/util/stop" 50 "github.com/cockroachdb/cockroach/pkg/util/syncutil" 51 "github.com/cockroachdb/cockroach/pkg/util/sysutil" 52 "github.com/cockroachdb/cockroach/pkg/util/timeutil" 53 
"github.com/cockroachdb/cockroach/pkg/util/tracing" 54 "github.com/cockroachdb/errors" 55 "github.com/cockroachdb/pebble" 56 opentracing "github.com/opentracing/opentracing-go" 57 "github.com/spf13/cobra" 58 "google.golang.org/grpc" 59 ) 60 61 // jemallocHeapDump is an optional function to be called at heap dump time. 62 // This will be non-nil when jemalloc is linked in with profiling enabled. 63 // The function takes a filename to write the profile to. 64 var jemallocHeapDump func(string) error 65 66 // startCmd starts a node by initializing the stores and joining 67 // the cluster. 68 var startCmd = &cobra.Command{ 69 Use: "start", 70 Short: "start a node in a multi-node cluster", 71 Long: ` 72 Start a CockroachDB node, which will export data from one or more 73 storage devices, specified via --store flags. 74 75 Specify the --join flag to point to another node or nodes that are 76 part of the same cluster. The other nodes do not need to be started 77 yet, and if the address of the other nodes to be added are not yet 78 known it is legal for the first node to join itself. 79 80 If --join is not specified, the cluster will also be initialized. 81 THIS BEHAVIOR IS DEPRECATED; consider using 'cockroach init' or 82 'cockroach start-single-node' instead. 83 `, 84 Example: ` cockroach start --insecure --store=attrs=ssd,path=/mnt/ssd1 --join=host:port,[host:port]`, 85 Args: cobra.NoArgs, 86 RunE: maybeShoutError(MaybeDecorateGRPCError(runStartJoin)), 87 } 88 89 // startSingleNodeCmd starts a node by initializing the stores. 90 var startSingleNodeCmd = &cobra.Command{ 91 Use: "start-single-node", 92 Short: "start a single-node cluster", 93 Long: ` 94 Start a CockroachDB node, which will export data from one or more 95 storage devices, specified via --store flags. 96 The cluster will also be automatically initialized with 97 replication disabled (replication factor = 1). 
98 `, 99 Example: ` cockroach start-single-node --insecure --store=attrs=ssd,path=/mnt/ssd1`, 100 Args: cobra.NoArgs, 101 RunE: maybeShoutError(MaybeDecorateGRPCError(runStartSingleNode)), 102 } 103 104 // StartCmds exports startCmd and startSingleNodeCmds so that other 105 // packages can add flags to them. 106 var StartCmds = []*cobra.Command{startCmd, startSingleNodeCmd} 107 108 // maxSizePerProfile is the maximum total size in bytes for profiles per 109 // profile type. 110 var maxSizePerProfile = envutil.EnvOrDefaultInt64( 111 "COCKROACH_MAX_SIZE_PER_PROFILE", 100<<20 /* 100 MB */) 112 113 // gcProfiles removes old profiles matching the specified prefix when the sum 114 // of newer profiles is larger than maxSize. Requires that the suffix used for 115 // the profiles indicates age (e.g. by using a date/timestamp suffix) such that 116 // sorting the filenames corresponds to ordering the profiles from oldest to 117 // newest. 118 func gcProfiles(dir, prefix string, maxSize int64) { 119 files, err := ioutil.ReadDir(dir) 120 if err != nil { 121 log.Warningf(context.Background(), "%v", err) 122 return 123 } 124 var sum int64 125 var found int 126 for i := len(files) - 1; i >= 0; i-- { 127 f := files[i] 128 if !f.Mode().IsRegular() { 129 continue 130 } 131 if !strings.HasPrefix(f.Name(), prefix) { 132 continue 133 } 134 found++ 135 sum += f.Size() 136 if found == 1 { 137 // Always keep the most recent profile. 138 continue 139 } 140 if sum <= maxSize { 141 continue 142 } 143 if err := os.Remove(filepath.Join(dir, f.Name())); err != nil { 144 log.Infof(context.Background(), "%v", err) 145 } 146 } 147 } 148 149 func initMemProfile(ctx context.Context, dir string) { 150 const jeprof = "jeprof." 151 const memprof = "memprof." 
152 153 gcProfiles(dir, jeprof, maxSizePerProfile) 154 gcProfiles(dir, memprof, maxSizePerProfile) 155 156 memProfileInterval := envutil.EnvOrDefaultDuration("COCKROACH_MEMPROF_INTERVAL", -1) 157 if memProfileInterval <= 0 { 158 return 159 } 160 if min := time.Second; memProfileInterval < min { 161 log.Infof(ctx, "fixing excessively short memory profiling interval: %s -> %s", 162 memProfileInterval, min) 163 memProfileInterval = min 164 } 165 166 if jemallocHeapDump != nil { 167 log.Infof(ctx, "writing go and jemalloc memory profiles to %s every %s", dir, memProfileInterval) 168 } else { 169 log.Infof(ctx, "writing go only memory profiles to %s every %s", dir, memProfileInterval) 170 log.Infof(ctx, `to enable jmalloc profiling: "export MALLOC_CONF=prof:true" or "ln -s prof:true /etc/malloc.conf"`) 171 } 172 173 go func() { 174 ctx := context.Background() 175 t := time.NewTicker(memProfileInterval) 176 defer t.Stop() 177 178 for { 179 <-t.C 180 181 func() { 182 const format = "2006-01-02T15_04_05.999" 183 suffix := timeutil.Now().Format(format) 184 185 // Try jemalloc heap profile first, we only log errors. 186 if jemallocHeapDump != nil { 187 jepath := filepath.Join(dir, jeprof+suffix) 188 if err := jemallocHeapDump(jepath); err != nil { 189 log.Warningf(ctx, "error writing jemalloc heap %s: %s", jepath, err) 190 } 191 gcProfiles(dir, jeprof, maxSizePerProfile) 192 } 193 194 path := filepath.Join(dir, memprof+suffix) 195 // Try writing a go heap profile. 196 f, err := os.Create(path) 197 if err != nil { 198 log.Warningf(ctx, "error creating go heap file %s", err) 199 return 200 } 201 defer f.Close() 202 if err = pprof.WriteHeapProfile(f); err != nil { 203 log.Warningf(ctx, "error writing go heap %s: %s", path, err) 204 return 205 } 206 gcProfiles(dir, memprof, maxSizePerProfile) 207 }() 208 } 209 }() 210 } 211 212 func initCPUProfile(ctx context.Context, dir string) { 213 const cpuprof = "cpuprof." 
214 gcProfiles(dir, cpuprof, maxSizePerProfile) 215 216 cpuProfileInterval := envutil.EnvOrDefaultDuration("COCKROACH_CPUPROF_INTERVAL", -1) 217 if cpuProfileInterval <= 0 { 218 return 219 } 220 if min := time.Second; cpuProfileInterval < min { 221 log.Infof(ctx, "fixing excessively short cpu profiling interval: %s -> %s", 222 cpuProfileInterval, min) 223 cpuProfileInterval = min 224 } 225 226 go func() { 227 defer log.RecoverAndReportPanic(ctx, &serverCfg.Settings.SV) 228 229 ctx := context.Background() 230 231 t := time.NewTicker(cpuProfileInterval) 232 defer t.Stop() 233 234 var currentProfile *os.File 235 defer func() { 236 if currentProfile != nil { 237 pprof.StopCPUProfile() 238 currentProfile.Close() 239 } 240 }() 241 242 for { 243 func() { 244 const format = "2006-01-02T15_04_05.999" 245 suffix := timeutil.Now().Add(cpuProfileInterval).Format(format) 246 f, err := os.Create(filepath.Join(dir, cpuprof+suffix)) 247 if err != nil { 248 log.Warningf(ctx, "error creating go cpu file %s", err) 249 return 250 } 251 252 // Stop the current profile if it exists. 253 if currentProfile != nil { 254 pprof.StopCPUProfile() 255 currentProfile.Close() 256 currentProfile = nil 257 gcProfiles(dir, cpuprof, maxSizePerProfile) 258 } 259 260 // Start the new profile. 261 if err := pprof.StartCPUProfile(f); err != nil { 262 log.Warningf(ctx, "unable to start cpu profile: %v", err) 263 f.Close() 264 return 265 } 266 currentProfile = f 267 }() 268 269 <-t.C 270 } 271 }() 272 } 273 274 func initBlockProfile() { 275 // Enable the block profile for a sample of mutex and channel operations. 276 // Smaller values provide more accurate profiles but are more 277 // expensive. 0 and 1 are special: 0 disables the block profile and 278 // 1 captures 100% of block events. For other values, the profiler 279 // will sample one event per X nanoseconds spent blocking. 
280 // 281 // The block profile can be viewed with `pprof http://HOST:PORT/debug/pprof/block` 282 // 283 // The utility of the block profile (aka blocking profile) has diminished 284 // with the advent of the mutex profile. We currently leave the block profile 285 // disabled by default as it has a non-zero performance impact. 286 d := envutil.EnvOrDefaultInt64("COCKROACH_BLOCK_PROFILE_RATE", 0) 287 runtime.SetBlockProfileRate(int(d)) 288 } 289 290 func initMutexProfile() { 291 // Enable the mutex profile for a fraction of mutex contention events. 292 // Smaller values provide more accurate profiles but are more expensive. 0 293 // and 1 are special: 0 disables the mutex profile and 1 captures 100% of 294 // mutex contention events. For other values, the profiler will sample on 295 // average 1/X events. 296 // 297 // The mutex profile can be viewed with `pprof http://HOST:PORT/debug/pprof/mutex` 298 d := envutil.EnvOrDefaultInt("COCKROACH_MUTEX_PROFILE_RATE", 299 1000 /* 1 sample per 1000 mutex contention events */) 300 runtime.SetMutexProfileFraction(d) 301 } 302 303 var cacheSizeValue = newBytesOrPercentageValue(&serverCfg.CacheSize, memoryPercentResolver) 304 var sqlSizeValue = newBytesOrPercentageValue(&serverCfg.MemoryPoolSize, memoryPercentResolver) 305 var diskTempStorageSizeValue = newBytesOrPercentageValue(nil /* v */, nil /* percentResolver */) 306 307 func initExternalIODir(ctx context.Context, firstStore base.StoreSpec) (string, error) { 308 externalIODir := startCtx.externalIODir 309 if externalIODir == "" && !firstStore.InMemory { 310 externalIODir = filepath.Join(firstStore.Path, "extern") 311 } 312 if externalIODir == "" || externalIODir == "disabled" { 313 return "", nil 314 } 315 if !filepath.IsAbs(externalIODir) { 316 return "", errors.Errorf("%s path must be absolute", cliflags.ExternalIODir.Name) 317 } 318 return externalIODir, nil 319 } 320 321 func initTempStorageConfig( 322 ctx context.Context, st *cluster.Settings, stopper *stop.Stopper, 
useStore base.StoreSpec, 323 ) (base.TempStorageConfig, error) { 324 var recordPath string 325 if !useStore.InMemory { 326 recordPath = filepath.Join(useStore.Path, server.TempDirsRecordFilename) 327 } 328 329 var err error 330 // Need to first clean up any abandoned temporary directories from 331 // the temporary directory record file before creating any new 332 // temporary directories in case the disk is completely full. 333 if recordPath != "" { 334 if err = storage.CleanupTempDirs(recordPath); err != nil { 335 return base.TempStorageConfig{}, errors.Wrap(err, "could not cleanup temporary directories from record file") 336 } 337 } 338 339 // The temp store size can depend on the location of the first regular store 340 // (if it's expressed as a percentage), so we resolve that flag here. 341 var tempStorePercentageResolver percentResolverFunc 342 if !useStore.InMemory { 343 dir := useStore.Path 344 // Create the store dir, if it doesn't exist. The dir is required to exist 345 // by diskPercentResolverFactory. 346 if err = os.MkdirAll(dir, 0755); err != nil { 347 return base.TempStorageConfig{}, errors.Wrapf(err, "failed to create dir for first store: %s", dir) 348 } 349 tempStorePercentageResolver, err = diskPercentResolverFactory(dir) 350 if err != nil { 351 return base.TempStorageConfig{}, errors.Wrapf(err, "failed to create resolver for: %s", dir) 352 } 353 } else { 354 tempStorePercentageResolver = memoryPercentResolver 355 } 356 var tempStorageMaxSizeBytes int64 357 if err = diskTempStorageSizeValue.Resolve( 358 &tempStorageMaxSizeBytes, tempStorePercentageResolver, 359 ); err != nil { 360 return base.TempStorageConfig{}, err 361 } 362 if !diskTempStorageSizeValue.IsSet() { 363 // The default temp storage size is different when the temp 364 // storage is in memory (which occurs when no temp directory 365 // is specified and the first store is in memory). 
366 if startCtx.tempDir == "" && useStore.InMemory { 367 tempStorageMaxSizeBytes = base.DefaultInMemTempStorageMaxSizeBytes 368 } else { 369 tempStorageMaxSizeBytes = base.DefaultTempStorageMaxSizeBytes 370 } 371 } 372 373 // Initialize a base.TempStorageConfig based on first store's spec and 374 // cli flags. 375 tempStorageConfig := base.TempStorageConfigFromEnv( 376 ctx, 377 st, 378 useStore, 379 startCtx.tempDir, 380 tempStorageMaxSizeBytes, 381 ) 382 383 // Set temp directory to first store's path if the temp storage is not 384 // in memory. 385 tempDir := startCtx.tempDir 386 if tempDir == "" && !tempStorageConfig.InMemory { 387 tempDir = useStore.Path 388 } 389 // Create the temporary subdirectory for the temp engine. 390 if tempStorageConfig.Path, err = storage.CreateTempDir(tempDir, server.TempDirPrefix, stopper); err != nil { 391 return base.TempStorageConfig{}, errors.Wrap(err, "could not create temporary directory for temp storage") 392 } 393 394 // We record the new temporary directory in the record file (if it 395 // exists) for cleanup in case the node crashes. 396 if recordPath != "" { 397 if err = storage.RecordTempDir(recordPath, tempStorageConfig.Path); err != nil { 398 return base.TempStorageConfig{}, errors.Wrapf( 399 err, 400 "could not record temporary directory path to record file: %s", 401 recordPath, 402 ) 403 } 404 } 405 406 return tempStorageConfig, nil 407 } 408 409 // Checks if the passed-in engine type is default, and if so, resolves it to 410 // the storage engine last used to write to the store at dir (or rocksdb if 411 // a store wasn't found). 
412 func resolveStorageEngineType( 413 ctx context.Context, engineType enginepb.EngineType, cfg base.StorageConfig, 414 ) enginepb.EngineType { 415 if engineType == enginepb.EngineTypeDefault { 416 engineType = enginepb.EngineTypePebble 417 pebbleCfg := &storage.PebbleConfig{ 418 StorageConfig: cfg, 419 Opts: storage.DefaultPebbleOptions(), 420 } 421 pebbleCfg.Opts.EnsureDefaults() 422 pebbleCfg.Opts.ReadOnly = true 423 // Resolve encrypted env options in pebbleCfg and populate pebbleCfg.Opts.FS 424 // if necessary (eg. encrypted-at-rest is enabled). 425 _, _, err := storage.ResolveEncryptedEnvOptions(pebbleCfg) 426 if err != nil { 427 log.Infof(ctx, "unable to setup encrypted env to resolve past engine type: %s", err) 428 return engineType 429 } 430 431 // Check if this storage directory was last written to by rocksdb. In that 432 // case, default to opening a RocksDB engine. 433 if version, err := pebble.GetVersion(cfg.Dir, pebbleCfg.Opts.FS); err == nil { 434 if strings.HasPrefix(version, "rocksdb") { 435 engineType = enginepb.EngineTypeRocksDB 436 } 437 } 438 } 439 return engineType 440 } 441 442 var errCannotUseJoin = errors.New("cannot use --join with 'cockroach start-single-node' -- use 'cockroach start' instead") 443 444 func runStartSingleNode(cmd *cobra.Command, args []string) error { 445 joinFlag := flagSetForCmd(cmd).Lookup(cliflags.Join.Name) 446 if joinFlag.Changed { 447 return errCannotUseJoin 448 } 449 // Now actually set the flag as changed so that the start code 450 // doesn't warn that it was not set. 
451 joinFlag.Changed = true 452 return runStart(cmd, args, true /*disableReplication*/) 453 } 454 455 func runStartJoin(cmd *cobra.Command, args []string) error { 456 return runStart(cmd, args, false /*disableReplication*/) 457 } 458 459 // runStart starts the cockroach node using --store as the list of 460 // storage devices ("stores") on this machine and --join as the list 461 // of other active nodes used to join this node to the cockroach 462 // cluster, if this is its first time connecting. 463 // 464 // If the argument disableReplication is true and we are starting 465 // a fresh cluster, the replication factor will be disabled in 466 // all zone configs. 467 func runStart(cmd *cobra.Command, args []string, disableReplication bool) error { 468 tBegin := timeutil.Now() 469 470 // First things first: if the user wants background processing, 471 // relinquish the terminal ASAP by forking and exiting. 472 // 473 // If executing in the background, the function returns ok == true in 474 // the parent process (regardless of err) and the parent exits at 475 // this point. 476 if ok, err := maybeRerunBackground(); ok { 477 return err 478 } 479 480 // Change the permission mask for all created files. 481 // 482 // We're considering everything produced by a cockroach node 483 // to potentially contain sensitive information, so it should 484 // not be world-readable. 485 disableOtherPermissionBits() 486 487 // Set up the signal handlers. This also ensures that any of these 488 // signals received beyond this point do not interrupt the startup 489 // sequence until the point signals are checked below. 490 // We want to set up signal handling before starting logging, because 491 // logging uses buffering, and we want to be able to sync 492 // the buffers in the signal handler below. If we started capturing 493 // signals later, some startup logging might be lost. 494 signalCh := make(chan os.Signal, 1) 495 signal.Notify(signalCh, drainSignals...) 
496 497 // Set up a cancellable context for the entire start command. 498 // The context will be canceled at the end. 499 ctx, cancel := context.WithCancel(context.Background()) 500 defer cancel() 501 502 // Set up a tracing span for the start process. We want any logging 503 // happening beyond this point to be accounted to this start 504 // context, including logging related to the initialization of 505 // the logging infrastructure below. 506 // This span concludes when the startup goroutine started below 507 // has completed. 508 // TODO(andrei): we don't close the span on the early returns below. 509 tracer := serverCfg.Settings.Tracer 510 sp := tracer.StartRootSpan("server start", nil /* logTags */, tracing.NonRecordableSpan) 511 ctx = opentracing.ContextWithSpan(ctx, sp) 512 513 // Set up the logging and profiling output. 514 // 515 // We want to do this as early as possible, because most of the code 516 // in CockroachDB may use logging, and until logging has been 517 // initialized log files will be created in $TMPDIR instead of their 518 // expected location. 519 // 520 // This initialization uses the various configuration parameters 521 // initialized by flag handling (before runStart was called). Any 522 // additional server configuration tweaks for the startup process 523 // must be necessarily non-logging-related, as logging parameters 524 // cannot be picked up beyond this point. 525 stopper, err := setupAndInitializeLoggingAndProfiling(ctx, cmd) 526 if err != nil { 527 return err 528 } 529 530 // If any store has something to say against a server start-up 531 // (e.g. previously detected corruption), listen to them now. 532 if err := serverCfg.Stores.PriorCriticalAlertError(); err != nil { 533 return err 534 } 535 536 // We don't care about GRPCs fairly verbose logs in most client commands, 537 // but when actually starting a server, we enable them. 538 grpcutil.SetSeverity(log.Severity_WARNING) 539 540 // Check the --join flag. 
541 if !flagSetForCmd(cmd).Lookup(cliflags.Join.Name).Changed { 542 log.Shout(ctx, log.Severity_WARNING, 543 "running 'cockroach start' without --join is deprecated.\n"+ 544 "Consider using 'cockroach start-single-node' or 'cockroach init' instead.") 545 } 546 547 // Now perform additional configuration tweaks specific to the start 548 // command. 549 550 // Derive temporary/auxiliary directory specifications. 551 if serverCfg.Settings.ExternalIODir, err = initExternalIODir(ctx, serverCfg.Stores.Specs[0]); err != nil { 552 return err 553 } 554 555 // Build a minimal StorageConfig out of the first store's spec, with enough 556 // attributes to be able to read encrypted-at-rest store directories. 557 firstSpec := serverCfg.Stores.Specs[0] 558 firstStoreConfig := base.StorageConfig{ 559 Attrs: firstSpec.Attributes, 560 Dir: firstSpec.Path, 561 Settings: serverCfg.Settings, 562 UseFileRegistry: firstSpec.UseFileRegistry, 563 ExtraOptions: firstSpec.ExtraOptions, 564 } 565 // If the storage engine is set to "default", check the engine type used in 566 // this store directory in a past run. If this check fails for any reason, 567 // use Pebble as the default engine type. 568 serverCfg.StorageEngine = resolveStorageEngineType(ctx, serverCfg.StorageEngine, firstStoreConfig) 569 570 // Next we initialize the target directory for temporary storage. 571 // If encryption at rest is enabled in any fashion, we'll want temp 572 // storage to be encrypted too. To achieve this, we use 573 // the first encrypted store as temp dir target, if any. 574 // If we can't find one, we use the first StoreSpec in the list. 
575 var specIdx = 0 576 for i := range serverCfg.Stores.Specs { 577 if serverCfg.Stores.Specs[i].ExtraOptions != nil { 578 specIdx = i 579 } 580 } 581 582 if serverCfg.TempStorageConfig, err = initTempStorageConfig( 583 ctx, serverCfg.Settings, stopper, serverCfg.Stores.Specs[specIdx], 584 ); err != nil { 585 return err 586 } 587 588 // Initialize the node's configuration from startup parameters. 589 // This also reads the part of the configuration that comes from 590 // environment variables. 591 if err := serverCfg.InitNode(ctx); err != nil { 592 return errors.Wrap(err, "failed to initialize node") 593 } 594 595 // The configuration is now ready to report to the user and the log 596 // file. We had to wait after InitNode() so that all configuration 597 // environment variables, which are reported too, have been read and 598 // registered. 599 reportConfiguration(ctx) 600 601 // Until/unless CockroachDB embeds its own tz database, we want 602 // an early sanity check. It's better to inform the user early 603 // than to get surprising errors during SQL queries. 604 if err := checkTzDatabaseAvailability(ctx); err != nil { 605 return errors.Wrap(err, "failed to initialize node") 606 } 607 608 // ReadyFn will be called when the server has started listening on 609 // its network sockets, but perhaps before it has done bootstrapping 610 // and thus before Start() completes. 611 serverCfg.ReadyFn = func(waitForInit bool) { 612 // Inform the user if the network settings are suspicious. We need 613 // to do that after starting to listen because we need to know 614 // which advertise address NewServer() has decided. 615 hintServerCmdFlags(ctx, cmd) 616 617 // If another process was waiting on the PID (e.g. using a FIFO), 618 // this is when we can tell them the node has started listening. 
619 if startCtx.pidFile != "" { 620 log.Infof(ctx, "PID file: %s", startCtx.pidFile) 621 if err := ioutil.WriteFile(startCtx.pidFile, []byte(fmt.Sprintf("%d\n", os.Getpid())), 0644); err != nil { 622 log.Errorf(ctx, "failed writing the PID: %v", err) 623 } 624 } 625 626 // If the invoker has requested an URL update, do it now that 627 // the server is ready to accept SQL connections. 628 // (Note: as stated above, ReadyFn is called after the server 629 // has started listening on its socket, but possibly before 630 // the cluster has been initialized and can start processing requests. 631 // This is OK for SQL clients, as the connection will be accepted 632 // by the network listener and will just wait/suspend until 633 // the cluster initializes, at which point it will be picked up 634 // and let the client go through, transparently.) 635 if startCtx.listeningURLFile != "" { 636 log.Infof(ctx, "listening URL file: %s", startCtx.listeningURLFile) 637 // (Re-)compute the client connection URL. We cannot do this 638 // earlier (e.g. above, in the runStart function) because 639 // at this time the address and port have not been resolved yet. 640 pgURL, err := serverCfg.PGURL(url.User(security.RootUser)) 641 if err != nil { 642 log.Errorf(ctx, "failed computing the URL: %v", err) 643 return 644 } 645 646 if err = ioutil.WriteFile(startCtx.listeningURLFile, []byte(fmt.Sprintf("%s\n", pgURL)), 0644); err != nil { 647 log.Errorf(ctx, "failed writing the URL: %v", err) 648 } 649 } 650 651 if waitForInit { 652 log.Shout(ctx, log.Severity_INFO, 653 "initial startup completed.\n"+ 654 "Node will now attempt to join a running cluster, or wait for `cockroach init`.\n"+ 655 "Client connections will be accepted after this completes successfully.\n"+ 656 "Check the log file(s) for progress. ") 657 } 658 659 // Ensure the configuration logging is written to disk in case a 660 // process is waiting for the sdnotify readiness to read important 661 // information from there. 
662 log.Flush() 663 664 // Signal readiness. This unblocks the process when running with 665 // --background or under systemd. 666 if err := sdnotify.Ready(); err != nil { 667 log.Errorf(ctx, "failed to signal readiness using systemd protocol: %s", err) 668 } 669 } 670 671 // DelayedBoostrapFn will be called if the boostrap process is 672 // taking a bit long. 673 serverCfg.DelayedBootstrapFn = func() { 674 const msg = `The server appears to be unable to contact the other nodes in the cluster. Please try: 675 676 - starting the other nodes, if you haven't already; 677 - double-checking that the '--join' and '--listen'/'--advertise' flags are set up correctly; 678 - running the 'cockroach init' command if you are trying to initialize a new cluster. 679 680 If problems persist, please see %s.` 681 docLink := base.DocsURL("cluster-setup-troubleshooting.html") 682 if !startCtx.inBackground { 683 log.Shoutf(context.Background(), log.Severity_WARNING, msg, docLink) 684 } else { 685 // Don't shout to stderr since the server will have detached by 686 // the time this function gets called. 687 log.Warningf(ctx, msg, docLink) 688 } 689 } 690 691 // Set up the Geospatial library. 692 // We need to make sure this happens before any queries involving geospatial data is executed. 693 loc, err := geos.EnsureInit(geos.EnsureInitErrorDisplayPrivate, demoCtx.geoLibsDir) 694 if err != nil { 695 log.Infof(ctx, "could not initialize GEOS - geospatial functions may not be available: %v", err) 696 } else { 697 log.Infof(ctx, "GEOS initialized at %s", loc) 698 } 699 700 // Beyond this point, the configuration is set and the server is 701 // ready to start. 702 log.Info(ctx, "starting cockroach node") 703 704 // Run the rest of the startup process in a goroutine separate from 705 // the main goroutine to avoid preventing proper handling of signals 706 // if we get stuck on something during initialization (#10138). 
707 var serverStatusMu struct { 708 syncutil.Mutex 709 // Used to synchronize server startup with server shutdown if something 710 // interrupts the process during initialization (it isn't safe to try to 711 // drain a server that doesn't exist or is in the middle of starting up, 712 // or to start a server after draining has begun). 713 started, draining bool 714 } 715 var s *server.Server 716 errChan := make(chan error, 1) 717 go func() { 718 // Ensure that the log files see the startup messages immediately. 719 defer log.Flush() 720 // If anything goes dramatically wrong, use Go's panic/recover 721 // mechanism to intercept the panic and log the panic details to 722 // the error reporting server. 723 defer func() { 724 if s != nil { 725 // We only attempt to log the panic details if the server has 726 // actually been started successfully. If there's no server, 727 // we won't know enough to decide whether reporting is 728 // permitted. 729 log.RecoverAndReportPanic(ctx, &s.ClusterSettings().SV) 730 } 731 }() 732 // When the start up goroutine completes, so can the start up span 733 // defined above. 734 defer sp.Finish() 735 736 // Any error beyond this point should be reported through the 737 // errChan defined above. However, in Go the code pattern "if err 738 // != nil { return err }" is more common. Expecting contributors 739 // to remember to write "if err != nil { errChan <- err }" beyond 740 // this point is optimistic. To avoid any error, we capture all 741 // the error returns in a closure, and do the errChan reporting, 742 // if needed, when that function returns. 743 if err := func() error { 744 // Instantiate the server. 745 var err error 746 s, err = server.NewServer(serverCfg, stopper) 747 if err != nil { 748 return errors.Wrap(err, "failed to start server") 749 } 750 751 // Have we already received a signal to terminate? If so, just 752 // stop here. 
753 serverStatusMu.Lock() 754 draining := serverStatusMu.draining 755 serverStatusMu.Unlock() 756 if draining { 757 return nil 758 } 759 760 // Attempt to start the server. 761 if err := s.Start(ctx); err != nil { 762 if le := (*server.ListenError)(nil); errors.As(err, &le) { 763 const errorPrefix = "consider changing the port via --%s" 764 if le.Addr == serverCfg.Addr { 765 err = errors.Wrapf(err, errorPrefix, cliflags.ListenAddr.Name) 766 } else if le.Addr == serverCfg.HTTPAddr { 767 err = errors.Wrapf(err, errorPrefix, cliflags.ListenHTTPAddr.Name) 768 } 769 } 770 771 return errors.Wrap(err, "cockroach server exited with error") 772 } 773 // Server started, notify the shutdown monitor running concurrently. 774 serverStatusMu.Lock() 775 serverStatusMu.started = true 776 serverStatusMu.Unlock() 777 778 // Start up the update check loop. 779 // We don't do this in (*server.Server).Start() because we don't want it 780 // in tests. 781 if !cluster.TelemetryOptOut() { 782 s.PeriodicallyCheckForUpdates(ctx) 783 } 784 785 initialBoot := s.InitialBoot() 786 787 if disableReplication && initialBoot { 788 // For start-single-node, set the default replication factor to 789 // 1 so as to avoid warning message and unnecessary rebalance 790 // churn. 791 if err := cliDisableReplication(ctx, s); err != nil { 792 log.Errorf(ctx, "could not disable replication: %v", err) 793 return err 794 } 795 log.Shout(ctx, log.Severity_INFO, 796 "Replication was disabled for this cluster.\n"+ 797 "When/if adding nodes in the future, update zone configurations to increase the replication factor.") 798 } 799 800 // Now inform the user that the server is running and tell the 801 // user about its run-time derived parameters. 
802 var buf bytes.Buffer 803 info := build.GetInfo() 804 tw := tabwriter.NewWriter(&buf, 2, 1, 2, ' ', 0) 805 fmt.Fprintf(tw, "CockroachDB node starting at %s (took %0.1fs)\n", timeutil.Now(), timeutil.Since(tBegin).Seconds()) 806 fmt.Fprintf(tw, "build:\t%s %s @ %s (%s)\n", info.Distribution, info.Tag, info.Time, info.GoVersion) 807 fmt.Fprintf(tw, "webui:\t%s\n", serverCfg.AdminURL()) 808 809 // (Re-)compute the client connection URL. We cannot do this 810 // earlier (e.g. above, in the runStart function) because 811 // at this time the address and port have not been resolved yet. 812 pgURL, err := serverCfg.PGURL(url.User(security.RootUser)) 813 if err != nil { 814 log.Errorf(ctx, "failed computing the URL: %v", err) 815 return err 816 } 817 fmt.Fprintf(tw, "sql:\t%s\n", pgURL) 818 819 fmt.Fprintf(tw, "RPC client flags:\t%s\n", clientFlagsRPC()) 820 if len(serverCfg.SocketFile) != 0 { 821 fmt.Fprintf(tw, "socket:\t%s\n", serverCfg.SocketFile) 822 } 823 fmt.Fprintf(tw, "logs:\t%s\n", flag.Lookup("log-dir").Value) 824 if serverCfg.AuditLogDirName.IsSet() { 825 fmt.Fprintf(tw, "SQL audit logs:\t%s\n", serverCfg.AuditLogDirName) 826 } 827 if serverCfg.Attrs != "" { 828 fmt.Fprintf(tw, "attrs:\t%s\n", serverCfg.Attrs) 829 } 830 if len(serverCfg.Locality.Tiers) > 0 { 831 fmt.Fprintf(tw, "locality:\t%s\n", serverCfg.Locality) 832 } 833 if s.TempDir() != "" { 834 fmt.Fprintf(tw, "temp dir:\t%s\n", s.TempDir()) 835 } 836 if ext := s.ClusterSettings().ExternalIODir; ext != "" { 837 fmt.Fprintf(tw, "external I/O path: \t%s\n", ext) 838 } else { 839 fmt.Fprintf(tw, "external I/O path: \t<disabled>\n") 840 } 841 for i, spec := range serverCfg.Stores.Specs { 842 fmt.Fprintf(tw, "store[%d]:\t%s\n", i, spec) 843 } 844 fmt.Fprintf(tw, "storage engine: \t%s\n", serverCfg.StorageEngine.String()) 845 nodeID := s.NodeID() 846 if initialBoot { 847 if nodeID == server.FirstNodeID { 848 fmt.Fprintf(tw, "status:\tinitialized new cluster\n") 849 } else { 850 fmt.Fprintf(tw, 
"status:\tinitialized new node, joined pre-existing cluster\n") 851 } 852 } else { 853 fmt.Fprintf(tw, "status:\trestarted pre-existing node\n") 854 } 855 856 if baseCfg.ClusterName != "" { 857 fmt.Fprintf(tw, "cluster name:\t%s\n", baseCfg.ClusterName) 858 } 859 860 // Remember the cluster ID for log file rotation. 861 clusterID := s.ClusterID().String() 862 log.SetClusterID(clusterID) 863 fmt.Fprintf(tw, "clusterID:\t%s\n", clusterID) 864 fmt.Fprintf(tw, "nodeID:\t%d\n", nodeID) 865 866 // Collect the formatted string and show it to the user. 867 if err := tw.Flush(); err != nil { 868 return err 869 } 870 msg := buf.String() 871 log.Infof(ctx, "node startup completed:\n%s", msg) 872 if !startCtx.inBackground && !log.LoggingToStderr(log.Severity_INFO) { 873 fmt.Print(msg) 874 } 875 876 return nil 877 }(); err != nil { 878 errChan <- err 879 } 880 }() 881 882 // The remainder of the main function executes concurrently with the 883 // start up goroutine started above. 884 // 885 // It is concerned with determining when the server should stop 886 // because the main process is being shut down -- either via a stop 887 // message received from `cockroach quit` / `cockroach 888 // decommission`, or a signal. 889 890 // We'll want to log any shutdown activity against a separate span. 891 shutdownSpan := tracer.StartSpan("server shutdown") 892 defer shutdownSpan.Finish() 893 shutdownCtx := opentracing.ContextWithSpan(context.Background(), shutdownSpan) 894 895 // returnErr will be populated with the error to use to exit the 896 // process (reported to the shell). 897 var returnErr error 898 899 stopWithoutDrain := make(chan struct{}) // closed if interrupted very early 900 901 // Block until one of the signals above is received or the stopper 902 // is stopped externally (for example, via the quit endpoint). 903 select { 904 case err := <-errChan: 905 // SetSync both flushes and ensures that subsequent log writes are flushed too. 
906 log.SetSync(true) 907 return err 908 909 case <-stopper.ShouldStop(): 910 // Server is being stopped externally and our job is finished 911 // here since we don't know if it's a graceful shutdown or not. 912 <-stopper.IsStopped() 913 // SetSync both flushes and ensures that subsequent log writes are flushed too. 914 log.SetSync(true) 915 return nil 916 917 case sig := <-signalCh: 918 // We start synchronizing log writes from here, because if a 919 // signal was received there is a non-zero chance the sender of 920 // this signal will follow up with SIGKILL if the shutdown is not 921 // timely, and we don't want logs to be lost. 922 log.SetSync(true) 923 924 log.Infof(shutdownCtx, "received signal '%s'", sig) 925 switch sig { 926 case os.Interrupt: 927 // Graceful shutdown after an interrupt should cause the process 928 // to terminate with a non-zero exit code; however SIGTERM is 929 // "legitimate" and should be acknowledged with a success exit 930 // code. So we keep the error state here for later. 931 returnErr = &cliError{ 932 exitCode: 1, 933 // INFO because a single interrupt is rather innocuous. 934 severity: log.Severity_INFO, 935 cause: errors.New("interrupted"), 936 } 937 msgDouble := "Note: a second interrupt will skip graceful shutdown and terminate forcefully" 938 fmt.Fprintln(os.Stdout, msgDouble) 939 940 case quitSignal: 941 log.DumpStacks(shutdownCtx) 942 } 943 944 // Start the draining process in a separate goroutine so that it 945 // runs concurrently with the timeout check below. 946 go func() { 947 serverStatusMu.Lock() 948 serverStatusMu.draining = true 949 drainingIsSafe := serverStatusMu.started 950 serverStatusMu.Unlock() 951 952 // drainingIsSafe may have been set in the meantime, but that's ok. 953 // In the worst case, we're not draining a Server that has *just* 954 // started. Not desirable, but not terrible either. 
955 if !drainingIsSafe { 956 close(stopWithoutDrain) 957 return 958 } 959 // Don't use shutdownCtx because this is in a goroutine that may 960 // still be running after shutdownCtx's span has been finished. 961 ac := log.AmbientContext{} 962 ac.AddLogTag("server drain process", nil) 963 drainCtx := ac.AnnotateCtx(context.Background()) 964 965 // Perform a graceful drain. We keep retrying forever, in 966 // case there are many range leases or some unavailability 967 // preventing progress. If the operator wants to expedite 968 // the shutdown, they will need to make it ungraceful 969 // via a 2nd signal. 970 for { 971 remaining, _, err := s.Drain(drainCtx) 972 if err != nil { 973 log.Errorf(drainCtx, "graceful drain failed: %v", err) 974 break 975 } 976 if remaining == 0 { 977 // No more work to do. 978 break 979 } 980 // Avoid a busy wait with high CPU usage if the server replies 981 // with an incomplete drain too quickly. 982 time.Sleep(200 * time.Millisecond) 983 } 984 985 stopper.Stop(drainCtx) 986 }() 987 988 // Don't return: we're shutting down gracefully. 989 990 case <-log.FatalChan(): 991 // A fatal error has occurred. Stop everything (gracelessly) to 992 // avoid serving incorrect data while the final log messages are 993 // being written. 994 // https://github.com/cockroachdb/cockroach/issues/23414 995 // TODO(bdarnell): This could be more graceless, for example by 996 // reaching into the server objects and closing all the 997 // connections while they're in use. That would be more in line 998 // with the expected effect of a log.Fatal. 999 stopper.Stop(shutdownCtx) 1000 // The logging goroutine is now responsible for killing this 1001 // process, so just block this goroutine. 1002 select {} 1003 } 1004 1005 // At this point, a signal has been received to shut down the 1006 // process, and a goroutine is busy telling the server to drain and 1007 // stop. From this point on, we just have to wait until the server 1008 // indicates it has stopped. 
1009 1010 const msgDrain = "initiating graceful shutdown of server" 1011 log.Info(shutdownCtx, msgDrain) 1012 fmt.Fprintln(os.Stdout, msgDrain) 1013 1014 // Notify the user every 5 second of the shutdown progress. 1015 go func() { 1016 ticker := time.NewTicker(5 * time.Second) 1017 defer ticker.Stop() 1018 for { 1019 select { 1020 case <-ticker.C: 1021 log.Infof(context.Background(), "%d running tasks", stopper.NumTasks()) 1022 case <-stopper.ShouldStop(): 1023 return 1024 case <-stopWithoutDrain: 1025 return 1026 } 1027 } 1028 }() 1029 1030 // Meanwhile, we don't want to wait too long either, in case the 1031 // server is getting stuck and doesn't shut down in a timely manner. 1032 // 1033 // So we also pay attention to any additional signal received beyond 1034 // this point (maybe some service monitor was impatient and sends 1035 // another signal to hasten the shutdown process). 1036 // 1037 // If any such trigger to hasten occurs, we simply return, which 1038 // will cause the process to exit and the server goroutines to be 1039 // forcefully terminated. 1040 1041 const hardShutdownHint = " - node may take longer to restart & clients may need to wait for leases to expire" 1042 select { 1043 case sig := <-signalCh: 1044 // This new signal is not welcome, as it interferes with the graceful 1045 // shutdown process. 
1046 log.Shoutf(shutdownCtx, log.Severity_ERROR, 1047 "received signal '%s' during shutdown, initiating hard shutdown%s", 1048 log.Safe(sig), log.Safe(hardShutdownHint)) 1049 handleSignalDuringShutdown(sig) 1050 panic("unreachable") 1051 1052 case <-stopper.IsStopped(): 1053 const msgDone = "server drained and shutdown completed" 1054 log.Infof(shutdownCtx, msgDone) 1055 fmt.Fprintln(os.Stdout, msgDone) 1056 1057 case <-stopWithoutDrain: 1058 const msgDone = "too early to drain; used hard shutdown instead" 1059 log.Infof(shutdownCtx, msgDone) 1060 fmt.Fprintln(os.Stdout, msgDone) 1061 } 1062 1063 return returnErr 1064 } 1065

// hintServerCmdFlags warns the operator when neither a listen address nor an
// advertise address was given on the command line, since the address that
// gets advertised by default may not be reachable from other nodes.
func hintServerCmdFlags(ctx context.Context, cmd *cobra.Command) {
	pf := flagSetForCmd(cmd)

	listenAddrSpecified := pf.Lookup(cliflags.ListenAddr.Name).Changed || pf.Lookup(cliflags.ServerHost.Name).Changed
	advAddrSpecified := pf.Lookup(cliflags.AdvertiseAddr.Name).Changed || pf.Lookup(cliflags.AdvertiseHost.Name).Changed
	if listenAddrSpecified || advAddrSpecified {
		return
	}

	host, _, err := net.SplitHostPort(serverCfg.AdvertiseAddr)
	if err != nil {
		// The advertised address could not be split into host and port (for
		// example because it carries no port). Show it unmodified rather
		// than printing an empty host in the warning below.
		host = serverCfg.AdvertiseAddr
	}
	log.Shoutf(ctx, log.Severity_WARNING,
		"neither --listen-addr nor --advertise-addr was specified.\n"+
			"The server will advertise %q to other nodes, is this routable?\n\n"+
			"Consider using:\n"+
			"- for local-only servers: --listen-addr=localhost\n"+
			"- for multi-node clusters: --advertise-addr=<host/IP addr>\n", host)
}

// clientFlagsRPC returns the command line prefix (program name, a
// placeholder for the client command, and connection flags) that a client
// command can use to reach this server, as shown in the startup summary.
func clientFlagsRPC() string {
	flags := []string{os.Args[0], "<client cmd>"}
	if serverCfg.AdvertiseAddr != "" {
		flags = append(flags, "--host="+serverCfg.AdvertiseAddr)
	}
	if startCtx.serverInsecure {
		flags = append(flags, "--insecure")
	} else {
		flags = append(flags, "--certs-dir="+startCtx.serverSSLCertsDir)
	}
	return strings.Join(flags, " ")
}

// checkTzDatabaseAvailability verifies that a named time zone can be loaded.
//
// It returns nil when the time zone loads, or when it does not load but the
// COCKROACH_INCONSISTENT_TIME_ZONES environment variable is set to true (in
// which case only a warning is emitted). Otherwise it returns an error
// carrying a hint and an issue link.
func checkTzDatabaseAvailability(ctx context.Context) error {
	_, err := timeutil.LoadLocation("America/New_York")
	if err == nil {
		return nil
	}
	log.Errorf(ctx, "timeutil.LoadLocation: %v", err)

	reportedErr := errors.WithHint(
		errors.WithIssueLink(
			errors.New("unable to load named timezones"),
			errors.IssueLink{IssueURL: unimplemented.MakeURL(36864)}),
		"Check that the time zone database is installed on your system, or\n"+
			"set the ZONEINFO environment variable to a Go time zone .zip archive.")

	if !envutil.EnvOrDefaultBool("COCKROACH_INCONSISTENT_TIME_ZONES", false) {
		// Prevent a successful start.
		//
		// In the past, we were simply using log.Shout to emit an error,
		// informing the user that startup could continue with degraded
		// behavior. However, usage demonstrated that users typically do
		// not see the error and instead run into silently incorrect SQL
		// results. To avoid this situation altogether, it's better to
		// stop early.
		return reportedErr
	}

	// The user tells us they really know what they want: warn only.
	log.Shoutf(ctx, log.Severity_WARNING, "%v", &formattedError{err: reportedErr})
	return nil
}

// reportConfiguration logs the server configuration, the environment
// variables in use, and the identity the process is running under.
func reportConfiguration(ctx context.Context) {
	serverCfg.Report(ctx)
	if envVarsUsed := envutil.GetEnvVarsUsed(); len(envVarsUsed) > 0 {
		log.Infof(ctx, "using local environment variables: %s", strings.Join(envVarsUsed, ", "))
	}
	// If a user ever reports "bad things have happened", any
	// troubleshooting steps will want to rule out that the user was
	// running as root in a multi-user environment, or using different
	// uid/gid across runs in the same data directory. To determine
	// this, it's easier if the information appears in the log file.
	log.Infof(ctx, "process identity: %s", sysutil.ProcessIdentity())
}

// maybeWarnMemorySizes emits warnings about the memory configuration:
//   - when --cache was left at its default, it suggests a larger value;
//   - when the cache size plus the SQL memory pool size exceeds 75% of the
//     total memory reported by status.GetTotalMemory, it warns about the
//     increased risk of memory-related failures.
//
// The second check is skipped when the total memory cannot be determined.
func maybeWarnMemorySizes(ctx context.Context) {
	// Query the total memory once, using the caller's context, and share the
	// result between both checks below.
	totalMem, memErr := status.GetTotalMemory(ctx)

	// Is the cache configuration OK?
	if !cacheSizeValue.IsSet() {
		var buf bytes.Buffer
		fmt.Fprintf(&buf, "Using the default setting for --cache (%s).\n", cacheSizeValue)
		fmt.Fprintf(&buf, " A significantly larger value is usually needed for good performance.\n")
		if memErr == nil {
			fmt.Fprintf(&buf, " If you have a dedicated server a reasonable setting is --cache=.25 (%s).",
				humanizeutil.IBytes(totalMem/4))
		} else {
			fmt.Fprintf(&buf, " If you have a dedicated server a reasonable setting is 25%% of physical memory.")
		}
		log.Warningf(ctx, "%s", buf.String())
	}

	// Check that the total suggested "max" memory is well below the available memory.
	if memErr != nil {
		return
	}
	requestedMem := serverCfg.CacheSize + serverCfg.MemoryPoolSize
	maxRecommendedMem := int64(.75 * float64(totalMem))
	if requestedMem > maxRecommendedMem {
		log.Shoutf(ctx, log.Severity_WARNING,
			"the sum of --max-sql-memory (%s) and --cache (%s) is larger than 75%% of total RAM (%s).\nThis server is running at increased risk of memory-related failures.",
			sqlSizeValue, cacheSizeValue, humanizeutil.IBytes(maxRecommendedMem))
	}
}

// logOutputDirectory returns the configured log directory as a string; the
// result is empty when no log directory is set.
func logOutputDirectory() string {
	return startCtx.logDir.String()
}

// setupAndInitializeLoggingAndProfiling does what it says on the label.
// Prior to this however it determines suitable defaults for the
// logging output directory and the verbosity level of stderr logging.
// We only do this for the "start" command which is why this work
// occurs here and not in an OnInitialize function.
//
// On success it returns a newly created stopper. When a log directory is in
// use, the storage engine loggers are registered as closers on that stopper
// just before returning.
func setupAndInitializeLoggingAndProfiling(
	ctx context.Context, cmd *cobra.Command,
) (stopper *stop.Stopper, err error) {
	// Default the log directory to the "logs" subdirectory of the first
	// non-memory store. If more than one non-memory store is detected,
	// print a warning.
	ambiguousLogDirs := false
	lf := cmd.Flags().Lookup(logflags.LogDirName)
	if !startCtx.logDir.IsSet() && !lf.Changed {
		// We only override the log directory if the user has not explicitly
		// disabled file logging using --log-dir="".
		newDir := ""
		for _, spec := range serverCfg.Stores.Specs {
			if spec.InMemory {
				continue
			}
			if newDir != "" {
				// A second on-disk store was found; keep the first one's
				// directory and remember to warn about the ambiguity.
				ambiguousLogDirs = true
				break
			}
			newDir = filepath.Join(spec.Path, "logs")
		}
		if err := startCtx.logDir.Set(newDir); err != nil {
			return nil, err
		}
	}

	if logDir := startCtx.logDir.String(); logDir != "" {
		ls := cockroachCmd.PersistentFlags().Lookup(logflags.LogToStderrName)
		if !ls.Changed {
			// Unless the settings were overridden by the user, silence
			// logging to stderr because the messages will go to a log file.
			if err := ls.Value.Set(log.Severity_NONE.String()); err != nil {
				return nil, err
			}
		}

		// Make sure the path exists.
		if err := os.MkdirAll(logDir, 0755); err != nil {
			return nil, errors.Wrap(err, "unable to create log directory")
		}

		// Note that we configured the --log-dir flag to set
		// startCtx.logDir. This is the point at which we set log-dir for the
		// util/log package. We don't want to set it earlier to avoid spuriously
		// creating a file in an incorrect log directory or if something is
		// accidentally logging after flag parsing but before the --background
		// dispatch has occurred.
		if err := flag.Lookup(logflags.LogDirName).Value.Set(logDir); err != nil {
			return nil, err
		}

		// NB: this message is a crutch until #33458 is addressed. Without it,
		// the calls to log.Shout below can be the first use of logging, hitting
		// the bug described in the issue.
		log.Infof(ctx, "logging to directory %s", logDir)

		// Start the log file GC daemon to remove files that make the log
		// directory too large.
		log.StartGCDaemon(ctx)

		defer func() {
			if stopper != nil {
				// When the function completes successfully, start the loggers
				// for the storage engines. This is deferred to the end because
				// the stopper they are registered with is only created at the
				// bottom of this function.
				stopper.AddCloser(storage.InitPebbleLogger(ctx))
				stopper.AddCloser(storage.InitRocksDBLogger(ctx))
			}
		}()
	}

	// We want to be careful to still produce useful debug dumps if the
	// server configuration has disabled logging to files.
	outputDirectory := "."
	if p := logOutputDirectory(); p != "" {
		outputDirectory = p
	}
	serverCfg.GoroutineDumpDirName = filepath.Join(outputDirectory, base.GoroutineDumpDir)
	serverCfg.HeapProfileDirName = filepath.Join(outputDirectory, base.HeapProfileDir)

	if ambiguousLogDirs {
		// Note that we can't report this message earlier, because the log directory
		// may not have been ready before the call to MkdirAll() above.
		log.Shout(ctx, log.Severity_WARNING, "multiple stores configured"+
			" and --log-dir not specified, you may want to specify --log-dir to disambiguate.")
	}

	if auditLogDir := serverCfg.AuditLogDirName.String(); auditLogDir != "" && auditLogDir != outputDirectory {
		// Make sure the path for the audit log exists, if it's a different path than
		// the main log.
		if err := os.MkdirAll(auditLogDir, 0755); err != nil {
			return nil, err
		}
		log.Eventf(ctx, "created SQL audit log directory %s", auditLogDir)
	}

	if startCtx.serverInsecure {
		// Use a non-annotated context here since the annotation just looks funny,
		// particularly to new users (made worse by it always printing as [n?]).
		addr := startCtx.serverListenAddr
		if addr == "" {
			addr = "<all your IP addresses>"
		}
		log.Shoutf(context.Background(), log.Severity_WARNING,
			"RUNNING IN INSECURE MODE!\n\n"+
				"- Your cluster is open for any client that can access %s.\n"+
				"- Any user, even root, can log in without providing a password.\n"+
				"- Any user, connecting as root, can read or write any data in your cluster.\n"+
				"- There is no network encryption nor authentication, and thus no confidentiality.\n\n"+
				"Check out how to secure your cluster: %s",
			addr, log.Safe(base.DocsURL("secure-a-cluster.html")))
	}

	maybeWarnMemorySizes(ctx)

	// Record the short build information in the log so that it appears
	// alongside the rest of the log output.
	info := build.GetInfo()
	log.Infof(ctx, "%s", info.Short())

	initMemProfile(ctx, outputDirectory)
	initCPUProfile(ctx, outputDirectory)
	initBlockProfile()
	initMutexProfile()

	// Create the stopper that is handed back to the caller.
	// NOTE(review): an earlier comment here stated that stopper task tracking
	// is disabled, but no option is passed to NewStopper in this function --
	// confirm against the stop package whether that is still the case.
	stopper = stop.NewStopper()
	log.Event(ctx, "initialized profiles")

	return stopper, nil
}

// addrWithDefaultHost returns addr with "localhost" substituted for an empty
// host part. It returns an error when addr cannot be split into host and
// port.
func addrWithDefaultHost(addr string) (string, error) {
	host, port, err := net.SplitHostPort(addr)
	if err != nil {
		return "", err
	}
	if host == "" {
		host = "localhost"
	}
	return net.JoinHostPort(host, port), nil
}

// getClientGRPCConn returns a ClientConn, a Clock and a method that blocks
// until the connection (and its associated goroutines) have terminated.
//
// The supplied context must be cancellable (its Done channel must be
// non-nil); otherwise an error is returned. On any error, the internally
// created stopper is stopped before returning.
func getClientGRPCConn(
	ctx context.Context, cfg server.Config,
) (*grpc.ClientConn, *hlc.Clock, func(), error) {
	if ctx.Done() == nil {
		return nil, nil, nil, errors.New("context must be cancellable")
	}
	// 0 to disable max offset checks; this RPC context is not a member of the
	// cluster, so there's no need to enforce that its max offset is the same
	// as that of nodes in the cluster.
	clock := hlc.NewClock(hlc.UnixNano, 0)
	stopper := stop.NewStopper()
	rpcContext := rpc.NewContext(
		log.AmbientContext{Tracer: cfg.Settings.Tracer},
		cfg.Config,
		clock,
		stopper,
		cfg.Settings,
	)
	addr, err := addrWithDefaultHost(cfg.AdvertiseAddr)
	if err != nil {
		stopper.Stop(ctx)
		return nil, nil, nil, err
	}
	// We use GRPCUnvalidatedDial() here because it does not matter
	// which node we're talking to.
	conn, err := rpcContext.GRPCUnvalidatedDial(addr).Connect(ctx)
	if err != nil {
		stopper.Stop(ctx)
		return nil, nil, nil, err
	}
	// Close the connection as part of stopping the stopper.
	stopper.AddCloser(stop.CloserFn(func() {
		_ = conn.Close() // best effort: nothing useful to do with a close error here
	}))

	// The returned closer stops the stopper (and thereby closes the
	// connection); the caller is responsible for invoking it.
	closer := func() {
		stopper.Stop(ctx)
	}
	return conn, clock, closer, nil
}