github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/server/server.go (about) 1 // Copyright 2014 The Cockroach Authors. 2 // 3 // Use of this software is governed by the Business Source License 4 // included in the file licenses/BSL.txt. 5 // 6 // As of the Change Date specified in that file, in accordance with 7 // the Business Source License, use of this software will be governed 8 // by the Apache License, Version 2.0, included in the file 9 // licenses/APL.txt. 10 11 package server 12 13 import ( 14 "compress/gzip" 15 "context" 16 "crypto/tls" 17 "fmt" 18 "io" 19 "io/ioutil" 20 "net" 21 "net/http" 22 "os" 23 "path/filepath" 24 "reflect" 25 "runtime" 26 "strings" 27 "sync" 28 "sync/atomic" 29 "time" 30 31 "github.com/cockroachdb/cmux" 32 "github.com/cockroachdb/cockroach/pkg/base" 33 "github.com/cockroachdb/cockroach/pkg/blobs" 34 "github.com/cockroachdb/cockroach/pkg/clusterversion" 35 "github.com/cockroachdb/cockroach/pkg/gossip" 36 "github.com/cockroachdb/cockroach/pkg/jobs" 37 "github.com/cockroachdb/cockroach/pkg/jobs/jobsprotectedts" 38 "github.com/cockroachdb/cockroach/pkg/kv" 39 "github.com/cockroachdb/cockroach/pkg/kv/kvclient/kvcoord" 40 "github.com/cockroachdb/cockroach/pkg/kv/kvserver" 41 "github.com/cockroachdb/cockroach/pkg/kv/kvserver/closedts/container" 42 "github.com/cockroachdb/cockroach/pkg/kv/kvserver/protectedts" 43 "github.com/cockroachdb/cockroach/pkg/kv/kvserver/protectedts/ptprovider" 44 "github.com/cockroachdb/cockroach/pkg/kv/kvserver/protectedts/ptreconcile" 45 "github.com/cockroachdb/cockroach/pkg/kv/kvserver/reports" 46 "github.com/cockroachdb/cockroach/pkg/roachpb" 47 "github.com/cockroachdb/cockroach/pkg/rpc" 48 "github.com/cockroachdb/cockroach/pkg/rpc/nodedialer" 49 "github.com/cockroachdb/cockroach/pkg/server/debug" 50 "github.com/cockroachdb/cockroach/pkg/server/goroutinedumper" 51 "github.com/cockroachdb/cockroach/pkg/server/heapprofiler" 52 "github.com/cockroachdb/cockroach/pkg/server/serverpb" 53 "github.com/cockroachdb/cockroach/pkg/server/status" 54 "github.com/cockroachdb/cockroach/pkg/server/telemetry" 55 "github.com/cockroachdb/cockroach/pkg/settings" 56 "github.com/cockroachdb/cockroach/pkg/settings/cluster" 57 "github.com/cockroachdb/cockroach/pkg/sql" 58 _ "github.com/cockroachdb/cockroach/pkg/sql/gcjob" // register jobs declared outside of pkg/sql 59 "github.com/cockroachdb/cockroach/pkg/sql/pgwire" 60 "github.com/cockroachdb/cockroach/pkg/sql/sqlbase" 61 "github.com/cockroachdb/cockroach/pkg/storage" 62 "github.com/cockroachdb/cockroach/pkg/storage/cloud" 63 "github.com/cockroachdb/cockroach/pkg/storage/enginepb" 64 "github.com/cockroachdb/cockroach/pkg/ts" 65 "github.com/cockroachdb/cockroach/pkg/ui" 66 "github.com/cockroachdb/cockroach/pkg/util" 67 "github.com/cockroachdb/cockroach/pkg/util/envutil" 68 "github.com/cockroachdb/cockroach/pkg/util/hlc" 69 "github.com/cockroachdb/cockroach/pkg/util/httputil" 70 "github.com/cockroachdb/cockroach/pkg/util/log" 71 "github.com/cockroachdb/cockroach/pkg/util/metric" 72 "github.com/cockroachdb/cockroach/pkg/util/netutil" 73 "github.com/cockroachdb/cockroach/pkg/util/protoutil" 74 "github.com/cockroachdb/cockroach/pkg/util/retry" 75 "github.com/cockroachdb/cockroach/pkg/util/stop" 76 "github.com/cockroachdb/cockroach/pkg/util/syncutil" 77 "github.com/cockroachdb/cockroach/pkg/util/timeutil" 78 "github.com/cockroachdb/cockroach/pkg/util/tracing" 79 "github.com/cockroachdb/cockroach/pkg/util/uuid" 80 "github.com/cockroachdb/errors" 81 "github.com/cockroachdb/logtags" 82 
"github.com/cockroachdb/sentry-go" 83 gwruntime "github.com/grpc-ecosystem/grpc-gateway/runtime" 84 "github.com/opentracing/opentracing-go" 85 "google.golang.org/grpc" 86 ) 87 88 var ( 89 // Allocation pool for gzipResponseWriters. 90 gzipResponseWriterPool sync.Pool 91 92 forwardClockJumpCheckEnabled = settings.RegisterPublicBoolSetting( 93 "server.clock.forward_jump_check_enabled", 94 "if enabled, forward clock jumps > max_offset/2 will cause a panic", 95 false, 96 ) 97 98 persistHLCUpperBoundInterval = settings.RegisterPublicDurationSetting( 99 "server.clock.persist_upper_bound_interval", 100 "the interval between persisting the wall time upper bound of the clock. The clock "+ 101 "does not generate a wall time greater than the persisted timestamp and will panic if "+ 102 "it sees a wall time greater than this value. When cockroach starts, it waits for the "+ 103 "wall time to catch-up till this persisted timestamp. This guarantees monotonic wall "+ 104 "time across server restarts. Not setting this or setting a value of 0 disables this "+ 105 "feature.", 106 0, 107 ) 108 ) 109 110 // TODO(peter): Until go1.11, ServeMux.ServeHTTP was not safe to call 111 // concurrently with ServeMux.Handle. So we provide our own wrapper with proper 112 // locking. Slightly less efficient because it locks unnecessarily, but 113 // safe. See TestServeMuxConcurrency. Should remove once we've upgraded to 114 // go1.11. 115 type safeServeMux struct { 116 mu syncutil.RWMutex 117 mux http.ServeMux 118 } 119 120 func (mux *safeServeMux) Handle(pattern string, handler http.Handler) { 121 mux.mu.Lock() 122 mux.mux.Handle(pattern, handler) 123 mux.mu.Unlock() 124 } 125 126 func (mux *safeServeMux) ServeHTTP(w http.ResponseWriter, r *http.Request) { 127 mux.mu.RLock() 128 mux.mux.ServeHTTP(w, r) 129 mux.mu.RUnlock() 130 } 131 132 // Server is the cockroach server node. 133 type Server struct { 134 // The following fields are populated in NewServer. 135 136 nodeIDContainer *base.NodeIDContainer 137 cfg Config 138 st *cluster.Settings 139 mux safeServeMux 140 clock *hlc.Clock 141 rpcContext *rpc.Context 142 // The gRPC server on which the different RPC handlers will be registered. 143 grpc *grpcServer 144 gossip *gossip.Gossip 145 nodeDialer *nodedialer.Dialer 146 nodeLiveness *kvserver.NodeLiveness 147 storePool *kvserver.StorePool 148 tcsFactory *kvcoord.TxnCoordSenderFactory 149 distSender *kvcoord.DistSender 150 db *kv.DB 151 node *Node 152 registry *metric.Registry 153 recorder *status.MetricsRecorder 154 runtime *status.RuntimeStatSampler 155 156 admin *adminServer 157 status *statusServer 158 authentication *authenticationServer 159 tsDB *ts.DB 160 tsServer *ts.Server 161 raftTransport *kvserver.RaftTransport 162 stopper *stop.Stopper 163 164 debug *debug.Server 165 166 replicationReporter *reports.Reporter 167 protectedtsProvider protectedts.Provider 168 protectedtsReconciler *ptreconcile.Reconciler 169 170 sqlServer *sqlServer 171 172 // The following fields are populated at start time, i.e. in `(*Server).Start`. 173 174 startTime time.Time 175 engines Engines 176 } 177 178 // NewServer creates a Server from a server.Config. 
179 func NewServer(cfg Config, stopper *stop.Stopper) (*Server, error) { 180 if err := cfg.ValidateAddrs(context.Background()); err != nil { 181 return nil, err 182 } 183 184 st := cfg.Settings 185 186 if cfg.AmbientCtx.Tracer == nil { 187 panic(errors.New("no tracer set in AmbientCtx")) 188 } 189 190 var clock *hlc.Clock 191 if cfg.ClockDevicePath != "" { 192 clockSrc, err := hlc.MakeClockSource(context.Background(), cfg.ClockDevicePath) 193 if err != nil { 194 return nil, errors.Wrap(err, "instantiating clock source") 195 } 196 clock = hlc.NewClock(clockSrc.UnixNano, time.Duration(cfg.MaxOffset)) 197 } else { 198 clock = hlc.NewClock(hlc.UnixNano, time.Duration(cfg.MaxOffset)) 199 } 200 registry := metric.NewRegistry() 201 // If the tracer has a Close function, call it after the server stops. 202 if tr, ok := cfg.AmbientCtx.Tracer.(stop.Closer); ok { 203 stopper.AddCloser(tr) 204 } 205 206 // Attempt to load TLS configs right away, failures are permanent. 207 if !cfg.Insecure { 208 // TODO(peter): Call methods on CertificateManager directly. Need to call 209 // base.wrapError or similar on the resulting error. 210 if _, err := cfg.GetServerTLSConfig(); err != nil { 211 return nil, err 212 } 213 if _, err := cfg.GetUIServerTLSConfig(); err != nil { 214 return nil, err 215 } 216 if _, err := cfg.GetClientTLSConfig(); err != nil { 217 return nil, err 218 } 219 cm, err := cfg.GetCertificateManager() 220 if err != nil { 221 return nil, err 222 } 223 cm.RegisterSignalHandler(stopper) 224 registry.AddMetricStruct(cm.Metrics()) 225 } 226 227 // Add a dynamic log tag value for the node ID. 228 // 229 // We need to pass an ambient context to the various server components, but we 230 // won't know the node ID until we Start(). At that point it's too late to 231 // change the ambient contexts in the components (various background processes 232 // will have already started using them). 233 // 234 // NodeIDContainer allows us to add the log tag to the context now and update 235 // the value asynchronously. It's not significantly more expensive than a 236 // regular tag since it's just doing an (atomic) load when a log/trace message 237 // is constructed. The node ID is set by the Store if this host was 238 // bootstrapped; otherwise a new one is allocated in Node. 239 nodeIDContainer := &base.NodeIDContainer{} 240 cfg.AmbientCtx.AddLogTag("n", nodeIDContainer) 241 const sqlInstanceID = base.SQLInstanceID(0) 242 idContainer := base.NewSQLIDContainer(sqlInstanceID, nodeIDContainer, true /* exposed */) 243 244 ctx := cfg.AmbientCtx.AnnotateCtx(context.Background()) 245 246 // Check the compatibility between the configured addresses and that 247 // provided in certificates. This also logs the certificate 248 // addresses in all cases to aid troubleshooting. 249 // This must be called after the certificate manager was initialized 250 // and after ValidateAddrs(). 
251 cfg.CheckCertificateAddrs(ctx) 252 253 var rpcContext *rpc.Context 254 if knobs := cfg.TestingKnobs.Server; knobs != nil { 255 serverKnobs := knobs.(*TestingKnobs) 256 rpcContext = rpc.NewContextWithTestingKnobs( 257 cfg.AmbientCtx, cfg.Config, clock, stopper, cfg.Settings, 258 serverKnobs.ContextTestingKnobs, 259 ) 260 } else { 261 rpcContext = rpc.NewContext(cfg.AmbientCtx, cfg.Config, clock, stopper, 262 cfg.Settings) 263 } 264 rpcContext.HeartbeatCB = func() { 265 if err := rpcContext.RemoteClocks.VerifyClockOffset(ctx); err != nil { 266 log.Fatalf(ctx, "%v", err) 267 } 268 } 269 registry.AddMetricStruct(rpcContext.Metrics()) 270 271 grpcServer := newGRPCServer(rpcContext) 272 273 g := gossip.New( 274 cfg.AmbientCtx, 275 &rpcContext.ClusterID, 276 nodeIDContainer, 277 rpcContext, 278 grpcServer.Server, 279 stopper, 280 registry, 281 cfg.Locality, 282 &cfg.DefaultZoneConfig, 283 ) 284 nodeDialer := nodedialer.New(rpcContext, gossip.AddressResolver(g)) 285 286 runtimeSampler := status.NewRuntimeStatSampler(ctx, clock) 287 registry.AddMetricStruct(runtimeSampler) 288 289 // A custom RetryOptions is created which uses stopper.ShouldQuiesce() as 290 // the Closer. This prevents infinite retry loops from occurring during 291 // graceful server shutdown 292 // 293 // Such a loop occurs when the DistSender attempts a connection to the 294 // local server during shutdown, and receives an internal server error (HTTP 295 // Code 5xx). This is the correct error for a server to return when it is 296 // shutting down, and is normally retryable in a cluster environment. 297 // However, on a single-node setup (such as a test), retries will never 298 // succeed because the only server has been shut down; thus, the 299 // DistSender needs to know that it should not retry in this situation. 
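//
// With the Closer set (see retryOpts.Closer below), retry loops of the form
//
//	for r := retry.StartWithCtx(ctx, retryOpts); r.Next(); {
//		// ... attempt the operation ...
//	}
//
// stop iterating once stopper.ShouldQuiesce() is closed instead of retrying
// forever against a server that is already shutting down. (Illustrative
// sketch only; the actual retry loops live inside the DistSender.)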
300 var clientTestingKnobs kvcoord.ClientTestingKnobs 301 if kvKnobs := cfg.TestingKnobs.KVClient; kvKnobs != nil { 302 clientTestingKnobs = *kvKnobs.(*kvcoord.ClientTestingKnobs) 303 } 304 retryOpts := cfg.RetryOptions 305 if retryOpts == (retry.Options{}) { 306 retryOpts = base.DefaultRetryOptions() 307 } 308 retryOpts.Closer = stopper.ShouldQuiesce() 309 distSenderCfg := kvcoord.DistSenderConfig{ 310 AmbientCtx: cfg.AmbientCtx, 311 Settings: st, 312 Clock: clock, 313 RPCContext: rpcContext, 314 RPCRetryOptions: &retryOpts, 315 TestingKnobs: clientTestingKnobs, 316 NodeDialer: nodeDialer, 317 } 318 distSender := kvcoord.NewDistSender(distSenderCfg, g) 319 registry.AddMetricStruct(distSender.Metrics()) 320 321 txnMetrics := kvcoord.MakeTxnMetrics(cfg.HistogramWindowInterval()) 322 registry.AddMetricStruct(txnMetrics) 323 txnCoordSenderFactoryCfg := kvcoord.TxnCoordSenderFactoryConfig{ 324 AmbientCtx: cfg.AmbientCtx, 325 Settings: st, 326 Clock: clock, 327 Stopper: stopper, 328 Linearizable: cfg.Linearizable, 329 Metrics: txnMetrics, 330 TestingKnobs: clientTestingKnobs, 331 } 332 tcsFactory := kvcoord.NewTxnCoordSenderFactory(txnCoordSenderFactoryCfg, distSender) 333 334 dbCtx := kv.DefaultDBContext() 335 dbCtx.NodeID = idContainer 336 dbCtx.Stopper = stopper 337 db := kv.NewDBWithContext(cfg.AmbientCtx, tcsFactory, clock, dbCtx) 338 339 nlActive, nlRenewal := cfg.NodeLivenessDurations() 340 341 nodeLiveness := kvserver.NewNodeLiveness( 342 cfg.AmbientCtx, 343 clock, 344 db, 345 g, 346 nlActive, 347 nlRenewal, 348 st, 349 cfg.HistogramWindowInterval(), 350 ) 351 registry.AddMetricStruct(nodeLiveness.Metrics()) 352 353 storePool := kvserver.NewStorePool( 354 cfg.AmbientCtx, 355 st, 356 g, 357 clock, 358 nodeLiveness.GetNodeCount, 359 kvserver.MakeStorePoolNodeLivenessFunc(nodeLiveness), 360 /* deterministic */ false, 361 ) 362 363 raftTransport := kvserver.NewRaftTransport( 364 cfg.AmbientCtx, st, nodeDialer, grpcServer.Server, stopper, 365 ) 366 367 tsDB := ts.NewDB(db, cfg.Settings) 368 registry.AddMetricStruct(tsDB.Metrics()) 369 nodeCountFn := func() int64 { 370 return nodeLiveness.Metrics().LiveNodes.Value() 371 } 372 sTS := ts.MakeServer(cfg.AmbientCtx, tsDB, nodeCountFn, cfg.TimeSeriesServerConfig, stopper) 373 374 // The InternalExecutor will be further initialized later, as we create more 375 // of the server's components. There's a circular dependency - many things 376 // need an InternalExecutor, but the InternalExecutor needs an ExecutorConfig, 377 // which in turn needs many things. That's why everybody that needs an 378 // InternalExecutor uses this one instance. 379 internalExecutor := &sql.InternalExecutor{} 380 jobRegistry := &jobs.Registry{} // ditto 381 382 // This function defines how ExternalStorage objects are created. 
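// Two factories are defined: externalStorage below takes an already-resolved
// roachpb.ExternalStorage destination, while externalStorageFromURI first
// parses a storage URI. Both build a blob client factory from the local node
// ID and dialer, and both are handed to the store config and the SQL server
// further down.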
383 externalStorage := func(ctx context.Context, dest roachpb.ExternalStorage) (cloud.ExternalStorage, error) { 384 return cloud.MakeExternalStorage( 385 ctx, dest, cfg.ExternalIODirConfig, st, 386 blobs.NewBlobClientFactory( 387 nodeIDContainer.Get(), 388 nodeDialer, 389 st.ExternalIODir, 390 ), 391 ) 392 } 393 externalStorageFromURI := func(ctx context.Context, uri string) (cloud.ExternalStorage, error) { 394 return cloud.ExternalStorageFromURI( 395 ctx, uri, cfg.ExternalIODirConfig, st, 396 blobs.NewBlobClientFactory( 397 nodeIDContainer.Get(), 398 nodeDialer, 399 st.ExternalIODir, 400 ), 401 ) 402 } 403 404 protectedtsProvider, err := ptprovider.New(ptprovider.Config{ 405 DB: db, 406 InternalExecutor: internalExecutor, 407 Settings: st, 408 }) 409 if err != nil { 410 return nil, err 411 } 412 413 // Break a circular dependency: we need a Node to make a StoreConfig (for 414 // ClosedTimestamp), but the Node needs a StoreConfig to be made. 415 var lateBoundNode *Node 416 417 storeCfg := kvserver.StoreConfig{ 418 DefaultZoneConfig: &cfg.DefaultZoneConfig, 419 Settings: st, 420 AmbientCtx: cfg.AmbientCtx, 421 RaftConfig: cfg.RaftConfig, 422 Clock: clock, 423 DB: db, 424 Gossip: g, 425 NodeLiveness: nodeLiveness, 426 Transport: raftTransport, 427 NodeDialer: nodeDialer, 428 RPCContext: rpcContext, 429 ScanInterval: cfg.ScanInterval, 430 ScanMinIdleTime: cfg.ScanMinIdleTime, 431 ScanMaxIdleTime: cfg.ScanMaxIdleTime, 432 HistogramWindowInterval: cfg.HistogramWindowInterval(), 433 StorePool: storePool, 434 SQLExecutor: internalExecutor, 435 LogRangeEvents: cfg.EventLogEnabled, 436 RangeDescriptorCache: distSender.RangeDescriptorCache(), 437 TimeSeriesDataStore: tsDB, 438 439 // Initialize the closed timestamp subsystem. Note that it won't 440 // be ready until it is .Start()ed, but the grpc server can be 441 // registered early. 442 ClosedTimestamp: container.NewContainer(container.Config{ 443 Settings: st, 444 Stopper: stopper, 445 Clock: nodeLiveness.AsLiveClock(), 446 // NB: s.node is not defined at this point, but it will be 447 // before this is ever called. 
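// (The Refresh closure below captures lateBoundNode, which is only assigned
// once NewNode has been called further down; by the time the closed
// timestamp subsystem calls Refresh, the node exists.)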
448 Refresh: func(rangeIDs ...roachpb.RangeID) { 449 for _, rangeID := range rangeIDs { 450 repl, _, err := lateBoundNode.stores.GetReplicaForRangeID(rangeID) 451 if err != nil || repl == nil { 452 continue 453 } 454 repl.EmitMLAI() 455 } 456 }, 457 Dialer: nodeDialer.CTDialer(), 458 }), 459 460 EnableEpochRangeLeases: true, 461 ExternalStorage: externalStorage, 462 ExternalStorageFromURI: externalStorageFromURI, 463 ProtectedTimestampCache: protectedtsProvider, 464 } 465 if storeTestingKnobs := cfg.TestingKnobs.Store; storeTestingKnobs != nil { 466 storeCfg.TestingKnobs = *storeTestingKnobs.(*kvserver.StoreTestingKnobs) 467 } 468 469 recorder := status.NewMetricsRecorder(clock, nodeLiveness, rpcContext, g, st) 470 registry.AddMetricStruct(rpcContext.RemoteClocks.Metrics()) 471 472 node := NewNode( 473 storeCfg, recorder, registry, stopper, 474 txnMetrics, nil /* execCfg */, &rpcContext.ClusterID) 475 lateBoundNode = node 476 roachpb.RegisterInternalServer(grpcServer.Server, node) 477 kvserver.RegisterPerReplicaServer(grpcServer.Server, node.perReplicaServer) 478 node.storeCfg.ClosedTimestamp.RegisterClosedTimestampServer(grpcServer.Server) 479 replicationReporter := reports.NewReporter( 480 db, node.stores, storePool, st, nodeLiveness, internalExecutor) 481 482 protectedtsReconciler := ptreconcile.NewReconciler(ptreconcile.Config{ 483 Settings: st, 484 Stores: node.stores, 485 DB: db, 486 Storage: protectedtsProvider, 487 Cache: protectedtsProvider, 488 StatusFuncs: ptreconcile.StatusFuncs{ 489 jobsprotectedts.MetaType: jobsprotectedts.MakeStatusFunc(jobRegistry), 490 }, 491 }) 492 registry.AddMetricStruct(protectedtsReconciler.Metrics()) 493 494 lateBoundServer := &Server{} 495 // TODO(tbg): give adminServer only what it needs (and avoid circular deps). 496 sAdmin := newAdminServer(lateBoundServer) 497 sessionRegistry := sql.NewSessionRegistry() 498 499 sStatus := newStatusServer( 500 cfg.AmbientCtx, 501 st, 502 cfg.Config, 503 sAdmin, 504 db, 505 g, 506 recorder, 507 nodeLiveness, 508 storePool, 509 rpcContext, 510 node.stores, 511 stopper, 512 sessionRegistry, 513 internalExecutor, 514 ) 515 // TODO(tbg): don't pass all of Server into this to avoid this hack. 
516 sAuth := newAuthenticationServer(lateBoundServer) 517 for i, gw := range []grpcGatewayServer{sAdmin, sStatus, sAuth, &sTS} { 518 if reflect.ValueOf(gw).IsNil() { 519 return nil, errors.Errorf("%d: nil", i) 520 } 521 gw.RegisterService(grpcServer.Server) 522 } 523 524 var jobAdoptionStopFile string 525 for _, spec := range cfg.Stores.Specs { 526 if !spec.InMemory && spec.Path != "" { 527 jobAdoptionStopFile = filepath.Join(spec.Path, jobs.PreventAdoptionFile) 528 break 529 } 530 } 531 532 sqlServer, err := newSQLServer(ctx, sqlServerArgs{ 533 sqlServerOptionalArgs: sqlServerOptionalArgs{ 534 rpcContext: rpcContext, 535 distSender: distSender, 536 statusServer: serverpb.MakeOptionalStatusServer(sStatus), 537 nodeLiveness: sqlbase.MakeOptionalNodeLiveness(nodeLiveness), 538 gossip: gossip.MakeExposedGossip(g), 539 nodeDialer: nodeDialer, 540 grpcServer: grpcServer.Server, 541 recorder: recorder, 542 nodeIDContainer: idContainer, 543 externalStorage: externalStorage, 544 externalStorageFromURI: externalStorageFromURI, 545 isMeta1Leaseholder: node.stores.IsMeta1Leaseholder, 546 }, 547 SQLConfig: &cfg.SQLConfig, 548 BaseConfig: &cfg.BaseConfig, 549 stopper: stopper, 550 clock: clock, 551 runtime: runtimeSampler, 552 db: db, 553 registry: registry, 554 sessionRegistry: sessionRegistry, 555 circularInternalExecutor: internalExecutor, 556 circularJobRegistry: jobRegistry, 557 jobAdoptionStopFile: jobAdoptionStopFile, 558 protectedtsProvider: protectedtsProvider, 559 }) 560 if err != nil { 561 return nil, err 562 } 563 sStatus.setStmtDiagnosticsRequester(sqlServer.execCfg.StmtDiagnosticsRecorder) 564 debugServer := debug.NewServer(st, sqlServer.pgServer.HBADebugFn()) 565 node.InitLogger(sqlServer.execCfg) 566 567 *lateBoundServer = Server{ 568 nodeIDContainer: nodeIDContainer, 569 cfg: cfg, 570 st: st, 571 clock: clock, 572 rpcContext: rpcContext, 573 grpc: grpcServer, 574 gossip: g, 575 nodeDialer: nodeDialer, 576 nodeLiveness: nodeLiveness, 577 storePool: storePool, 578 tcsFactory: tcsFactory, 579 distSender: distSender, 580 db: db, 581 node: node, 582 registry: registry, 583 recorder: recorder, 584 runtime: runtimeSampler, 585 admin: sAdmin, 586 status: sStatus, 587 authentication: sAuth, 588 tsDB: tsDB, 589 tsServer: &sTS, 590 raftTransport: raftTransport, 591 stopper: stopper, 592 debug: debugServer, 593 replicationReporter: replicationReporter, 594 protectedtsProvider: protectedtsProvider, 595 protectedtsReconciler: protectedtsReconciler, 596 sqlServer: sqlServer, 597 } 598 return lateBoundServer, err 599 } 600 601 // ClusterSettings returns the cluster settings. 602 func (s *Server) ClusterSettings() *cluster.Settings { 603 return s.st 604 } 605 606 // AnnotateCtx is a convenience wrapper; see AmbientContext. 607 func (s *Server) AnnotateCtx(ctx context.Context) context.Context { 608 return s.cfg.AmbientCtx.AnnotateCtx(ctx) 609 } 610 611 // AnnotateCtxWithSpan is a convenience wrapper; see AmbientContext. 612 func (s *Server) AnnotateCtxWithSpan( 613 ctx context.Context, opName string, 614 ) (context.Context, opentracing.Span) { 615 return s.cfg.AmbientCtx.AnnotateCtxWithSpan(ctx, opName) 616 } 617 618 // ClusterID returns the ID of the cluster this server is a part of. 619 func (s *Server) ClusterID() uuid.UUID { 620 return s.rpcContext.ClusterID.Get() 621 } 622 623 // NodeID returns the ID of this node within its cluster. 
624 func (s *Server) NodeID() roachpb.NodeID { 625 return s.node.Descriptor.NodeID 626 } 627 628 // InitialBoot returns whether this is the first time the node has booted. 629 // Only intended to help print debugging info during server startup. 630 func (s *Server) InitialBoot() bool { 631 return s.node.initialBoot 632 } 633 634 // grpcGatewayServer represents a grpc service with HTTP endpoints through GRPC 635 // gateway. 636 type grpcGatewayServer interface { 637 RegisterService(g *grpc.Server) 638 RegisterGateway( 639 ctx context.Context, 640 mux *gwruntime.ServeMux, 641 conn *grpc.ClientConn, 642 ) error 643 } 644 645 // ListenError is returned from Start when we fail to start listening on either 646 // the main Cockroach port or the HTTP port, so that the CLI can instruct the 647 // user on what might have gone wrong. 648 type ListenError struct { 649 cause error 650 Addr string 651 } 652 653 // Error implements error. 654 func (l *ListenError) Error() string { return l.cause.Error() } 655 656 // Unwrap is because ListenError is a wrapper. 657 func (l *ListenError) Unwrap() error { return l.cause } 658 659 // inspectEngines goes through engines and populates in initDiskState. It also 660 // calls SynthesizeClusterVersionFromEngines, which selects and backfills the 661 // cluster version to all initialized engines. 662 // 663 // The initDiskState returned by this method will reflect a zero NodeID if none 664 // has been assigned yet (i.e. if none of the engines is initialized). 665 func inspectEngines( 666 ctx context.Context, 667 engines []storage.Engine, 668 binaryVersion, binaryMinSupportedVersion roachpb.Version, 669 ) (*initDiskState, error) { 670 state := &initDiskState{} 671 672 for _, eng := range engines { 673 storeIdent, err := kvserver.ReadStoreIdent(ctx, eng) 674 if errors.HasType(err, (*kvserver.NotBootstrappedError)(nil)) { 675 state.newEngines = append(state.newEngines, eng) 676 continue 677 } else if err != nil { 678 return nil, err 679 } 680 681 if state.clusterID != uuid.Nil && state.clusterID != storeIdent.ClusterID { 682 return nil, errors.Errorf("conflicting store ClusterIDs: %s, %s", storeIdent.ClusterID, state.clusterID) 683 } 684 state.clusterID = storeIdent.ClusterID 685 686 if storeIdent.StoreID == 0 || storeIdent.NodeID == 0 || storeIdent.ClusterID == uuid.Nil { 687 return nil, errors.Errorf("partially initialized store: %+v", storeIdent) 688 } 689 690 if state.nodeID != 0 && state.nodeID != storeIdent.NodeID { 691 return nil, errors.Errorf("conflicting store NodeIDs: %s, %s", storeIdent.NodeID, state.nodeID) 692 } 693 state.nodeID = storeIdent.NodeID 694 695 state.initializedEngines = append(state.initializedEngines, eng) 696 } 697 698 cv, err := kvserver.SynthesizeClusterVersionFromEngines(ctx, state.initializedEngines, binaryVersion, binaryMinSupportedVersion) 699 if err != nil { 700 return nil, err 701 } 702 state.clusterVersion = cv 703 return state, nil 704 } 705 706 // listenerInfo is a helper used to write files containing various listener 707 // information to the store directories. In contrast to the "listening url 708 // file", these are written once the listeners are available, before the server 709 // is necessarily ready to serve. 
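//
// The file names written are the ones returned by Iter below
// (cockroach.listen-addr, cockroach.advertise-addr, cockroach.sql-addr,
// cockroach.advertise-sql-addr and cockroach.http-addr), one set per
// on-disk store directory.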
type listenerInfo struct {
	listenRPC    string // the (RPC) listen address, rewritten after name resolution and port allocation
	advertiseRPC string // contains the original addr part of --listen/--advertise, with actual port number after port allocation if original was 0
	listenHTTP   string // the HTTP endpoint
	listenSQL    string // the SQL endpoint, rewritten after name resolution and port allocation
	advertiseSQL string // contains the original addr part of --sql-addr, with actual port number after port allocation if original was 0
}

// Iter returns a mapping of file names to desired contents.
func (li listenerInfo) Iter() map[string]string {
	return map[string]string{
		"cockroach.advertise-addr":     li.advertiseRPC,
		"cockroach.http-addr":          li.listenHTTP,
		"cockroach.listen-addr":        li.listenRPC,
		"cockroach.sql-addr":           li.listenSQL,
		"cockroach.advertise-sql-addr": li.advertiseSQL,
	}
}

// startMonitoringForwardClockJumps starts a background task to monitor forward
// clock jumps based on a cluster setting.
func (s *Server) startMonitoringForwardClockJumps(ctx context.Context) error {
	forwardJumpCheckEnabled := make(chan bool, 1)
	s.stopper.AddCloser(stop.CloserFn(func() { close(forwardJumpCheckEnabled) }))

	forwardClockJumpCheckEnabled.SetOnChange(&s.st.SV, func() {
		forwardJumpCheckEnabled <- forwardClockJumpCheckEnabled.Get(&s.st.SV)
	})

	if err := s.clock.StartMonitoringForwardClockJumps(
		ctx,
		forwardJumpCheckEnabled,
		time.NewTicker,
		nil, /* tick callback */
	); err != nil {
		return errors.Wrap(err, "monitoring forward clock jumps")
	}

	log.Info(ctx, "monitoring forward clock jumps based on server.clock.forward_jump_check_enabled")
	return nil
}

// ensureClockMonotonicity sleeps until the wall time reaches
// prevHLCUpperBound. prevHLCUpperBound > 0 implies we need to guarantee HLC
// monotonicity across server restarts. prevHLCUpperBound is the last
// successfully persisted timestamp greater than any wall time used by the
// server.
//
// If prevHLCUpperBound is 0, the function sleeps up to max offset.
func ensureClockMonotonicity(
	ctx context.Context,
	clock *hlc.Clock,
	startTime time.Time,
	prevHLCUpperBound int64,
	sleepUntilFn func(until int64, currTime func() int64),
) {
	var sleepUntil int64
	if prevHLCUpperBound != 0 {
		// Sleep until the previous HLC upper bound to ensure wall time monotonicity.
		sleepUntil = prevHLCUpperBound + 1
	} else {
		// The previous HLC upper bound is not known.
		// We might have to sleep a bit to protect against this node producing non-
		// monotonic timestamps. Before restarting, its clock might have been driven
		// by other nodes' fast clocks, but when we restarted, we lost all this
		// information. For example, a client might have written a value at a
		// timestamp that's in the future of the restarted node's clock, and if we
		// don't do something, the same client's read would not return the written
		// value. So, we wait up to MaxOffset; we couldn't have served timestamps more
		// than MaxOffset in the future (assuming that MaxOffset was not changed, see
		// #9733).
		//
		// As an optimization for tests, we don't sleep if all the stores are brand
		// new. In this case, the node will not serve anything anyway until it
		// synchronizes with other nodes.
		sleepUntil = startTime.UnixNano() + int64(clock.MaxOffset()) + 1
	}

	currentWallTimeFn := func() int64 { /* function to report current time */
		return clock.Now().WallTime
	}
	currentWallTime := currentWallTimeFn()
	delta := time.Duration(sleepUntil - currentWallTime)
	if delta > 0 {
		log.Infof(
			ctx,
			"Sleeping until wall time %v catches up to %v to ensure monotonicity. Delta: %v",
			currentWallTime,
			sleepUntil,
			delta,
		)
		sleepUntilFn(sleepUntil, currentWallTimeFn)
	}
}

// periodicallyPersistHLCUpperBound periodically persists an upper bound of
// the HLC's wall time. The interval for persisting is read from
// persistHLCUpperBoundIntervalCh. An interval of 0 disables persisting.
//
// persistHLCUpperBoundFn is used to persist the HLC upper bound, and should
// return an error if the persist fails.
//
// tickerFn is used to create the ticker used for persisting.
//
// tickCallback is called whenever a tick is processed.
func periodicallyPersistHLCUpperBound(
	clock *hlc.Clock,
	persistHLCUpperBoundIntervalCh chan time.Duration,
	persistHLCUpperBoundFn func(int64) error,
	tickerFn func(d time.Duration) *time.Ticker,
	stopCh <-chan struct{},
	tickCallback func(),
) {
	// Create a ticker which can be used in selects.
	// This ticker is turned on / off based on persistHLCUpperBoundIntervalCh.
	ticker := tickerFn(time.Hour)
	ticker.Stop()

	// persistInterval is the interval used for persisting an
	// upper bound of the HLC.
	var persistInterval time.Duration
	var ok bool

	persistHLCUpperBound := func() {
		if err := clock.RefreshHLCUpperBound(
			persistHLCUpperBoundFn,
			int64(persistInterval*3), /* delta to compute upper bound */
		); err != nil {
			log.Fatalf(
				context.Background(),
				"error persisting HLC upper bound: %v",
				err,
			)
		}
	}

	for {
		select {
		case persistInterval, ok = <-persistHLCUpperBoundIntervalCh:
			ticker.Stop()
			if !ok {
				return
			}

			if persistInterval > 0 {
				ticker = tickerFn(persistInterval)
				persistHLCUpperBound()
				log.Info(context.Background(), "persisting HLC upper bound is enabled")
			} else {
				if err := clock.ResetHLCUpperBound(persistHLCUpperBoundFn); err != nil {
					log.Fatalf(
						context.Background(),
						"error resetting hlc upper bound: %v",
						err,
					)
				}
				log.Info(context.Background(), "persisting HLC upper bound is disabled")
			}

		case <-ticker.C:
			if persistInterval > 0 {
				persistHLCUpperBound()
			}

		case <-stopCh:
			ticker.Stop()
			return
		}

		if tickCallback != nil {
			tickCallback()
		}
	}
}

// startPersistingHLCUpperBound starts a goroutine to persist an upper bound
// to the HLC.
887 // 888 // persistHLCUpperBoundFn is used to persist upper bound of the HLC, and should 889 // return an error if the persist fails 890 // 891 // tickerFn is used to create a new ticker 892 // 893 // tickCallback is called whenever persistHLCUpperBoundCh or a ticker tick is 894 // processed 895 func (s *Server) startPersistingHLCUpperBound( 896 ctx context.Context, 897 hlcUpperBoundExists bool, 898 persistHLCUpperBoundFn func(int64) error, 899 tickerFn func(d time.Duration) *time.Ticker, 900 ) error { 901 persistHLCUpperBoundIntervalCh := make(chan time.Duration, 1) 902 persistHLCUpperBoundInterval.SetOnChange(&s.st.SV, func() { 903 persistHLCUpperBoundIntervalCh <- persistHLCUpperBoundInterval.Get(&s.st.SV) 904 }) 905 906 if hlcUpperBoundExists { 907 // The feature to persist upper bounds to wall times is enabled. 908 // Persist a new upper bound to continue guaranteeing monotonicity 909 // Going forward the goroutine launched below will take over persisting 910 // the upper bound 911 if err := s.clock.RefreshHLCUpperBound( 912 persistHLCUpperBoundFn, 913 int64(5*time.Second), 914 ); err != nil { 915 return errors.Wrap(err, "refreshing HLC upper bound") 916 } 917 } 918 919 s.stopper.RunWorker( 920 ctx, 921 func(context.Context) { 922 periodicallyPersistHLCUpperBound( 923 s.clock, 924 persistHLCUpperBoundIntervalCh, 925 persistHLCUpperBoundFn, 926 tickerFn, 927 s.stopper.ShouldStop(), 928 nil, /* tick callback */ 929 ) 930 }, 931 ) 932 return nil 933 } 934 935 // Start starts the server on the specified port, starts gossip and initializes 936 // the node using the engines from the server's context. This is complex since 937 // it sets up the listeners and the associated port muxing, but especially since 938 // it has to solve the "bootstrapping problem": nodes need to connect to Gossip 939 // fairly early, but what drives Gossip connectivity are the first range 940 // replicas in the kv store. This in turn suggests opening the Gossip server 941 // early. However, naively doing so also serves most other services prematurely, 942 // which exposes a large surface of potentially underinitialized services. This 943 // is avoided with some additional complexity that can be summarized as follows: 944 // 945 // - before blocking trying to connect to the Gossip network, we already open 946 // the admin UI (so that its diagnostics are available) 947 // - we also allow our Gossip and our connection health Ping service 948 // - everything else returns Unavailable errors (which are retryable) 949 // - once the node has started, unlock all RPCs. 950 // 951 // The passed context can be used to trace the server startup. The context 952 // should represent the general startup operation. 953 func (s *Server) Start(ctx context.Context) error { 954 ctx = s.AnnotateCtx(ctx) 955 956 // Start the time sanity checker. 957 s.startTime = timeutil.Now() 958 if err := s.startMonitoringForwardClockJumps(ctx); err != nil { 959 return err 960 } 961 962 // Connect the node as loopback handler for RPC requests to the 963 // local node. 964 s.rpcContext.SetLocalInternalServer(s.node) 965 966 // Load the TLS configuration for the HTTP server. 967 uiTLSConfig, err := s.cfg.GetUIServerTLSConfig() 968 if err != nil { 969 return err 970 } 971 972 // connManager tracks incoming connections accepted via listeners 973 // and automatically closes them when the stopper indicates a 974 // shutdown. 975 // This handles both: 976 // - HTTP connections for the admin UI with an optional TLS handshake over HTTP. 
977 // - SQL client connections with a TLS handshake over TCP. 978 // (gRPC connections are handled separately via s.grpc and perform 979 // their TLS handshake on their own) 980 connManager := netutil.MakeServer(s.stopper, uiTLSConfig, s) 981 982 // Start a context for the asynchronous network workers. 983 workersCtx := s.AnnotateCtx(context.Background()) 984 985 // Start the admin UI server. This opens the HTTP listen socket, 986 // optionally sets up TLS, and dispatches the server worker for the 987 // web UI. 988 if err := s.startServeUI(ctx, workersCtx, connManager, uiTLSConfig); err != nil { 989 return err 990 } 991 992 s.engines, err = s.cfg.CreateEngines(ctx) 993 if err != nil { 994 return errors.Wrap(err, "failed to create engines") 995 } 996 s.stopper.AddCloser(&s.engines) 997 998 bootstrapVersion := s.cfg.Settings.Version.BinaryVersion() 999 if knobs := s.cfg.TestingKnobs.Server; knobs != nil { 1000 if ov := knobs.(*TestingKnobs).BootstrapVersionOverride; ov != (roachpb.Version{}) { 1001 bootstrapVersion = ov 1002 } 1003 } 1004 1005 // Set up the init server. We have to do this relatively early because we 1006 // can't call RegisterInitServer() after `grpc.Serve`, which is called in 1007 // startRPCServer (and for the loopback grpc-gw connection). 1008 initServer, err := setupInitServer( 1009 ctx, 1010 s.cfg.Settings.Version.BinaryVersion(), 1011 s.cfg.Settings.Version.BinaryMinSupportedVersion(), 1012 bootstrapVersion, 1013 &s.cfg.DefaultZoneConfig, 1014 &s.cfg.DefaultSystemZoneConfig, 1015 s.engines, 1016 ) 1017 if err != nil { 1018 return err 1019 } 1020 1021 { 1022 // Set up the callback that persists gossiped version bumps to the 1023 // engines. The invariant we uphold here is that the bump needs to be 1024 // persisted on all engines before it becomes "visible" to the version 1025 // setting. To this end, 1026 // 1027 // a) make sure Gossip is not started yet, and 1028 // b) set up the BeforeChange callback on the version setting to persist 1029 // incoming updates to all engines. 1030 // c) write back the disk-loaded cluster version to all engines, 1031 // d) initialize the version setting (with the disk-loaded version). 1032 // 1033 // Note that "all engines" means "all engines", not "all initialized 1034 // engines". We cannot initialize engines this early in the boot 1035 // sequence. 1036 s.gossip.AssertNotStarted(ctx) 1037 1038 // Serialize the callback through a mutex to make sure we're not 1039 // clobbering the disk state if callback gets fired off concurrently. 1040 var mu syncutil.Mutex 1041 cb := func(ctx context.Context, newCV clusterversion.ClusterVersion) { 1042 mu.Lock() 1043 defer mu.Unlock() 1044 v := s.cfg.Settings.Version 1045 prevCV, err := kvserver.SynthesizeClusterVersionFromEngines( 1046 ctx, s.engines, v.BinaryVersion(), v.BinaryMinSupportedVersion(), 1047 ) 1048 if err != nil { 1049 log.Fatalf(ctx, "%v", err) 1050 } 1051 if !prevCV.Version.Less(newCV.Version) { 1052 // If nothing needs to be updated, don't do anything. The 1053 // callbacks fire async (or at least we want to assume the worst 1054 // case in which they do) and so an old update might happen 1055 // after a new one. 
1056 return 1057 } 1058 if err := kvserver.WriteClusterVersionToEngines(ctx, s.engines, newCV); err != nil { 1059 log.Fatalf(ctx, "%v", err) 1060 } 1061 log.Infof(ctx, "active cluster version is now %s (up from %s)", newCV, prevCV) 1062 } 1063 clusterversion.SetBeforeChange(ctx, &s.cfg.Settings.SV, cb) 1064 1065 diskClusterVersion := initServer.DiskClusterVersion() 1066 // The version setting loaded from disk is the maximum cluster version 1067 // seen on any engine. If new stores are being added to the server right 1068 // now, or if the process crashed earlier half-way through the callback, 1069 // that version won't be on all engines. For that reason, we backfill 1070 // once. 1071 if err := kvserver.WriteClusterVersionToEngines( 1072 ctx, s.engines, diskClusterVersion, 1073 ); err != nil { 1074 return err 1075 } 1076 1077 // NB: if we bootstrap a new server (in initServer.ServeAndWait below) 1078 // we will call Initialize a second time, to eagerly move it to the 1079 // bootstrap version (from the min supported version). Initialize() 1080 // tolerates that. Note that in that case we know that the callback 1081 // has not fired yet, since Gossip won't connect (to itself) until 1082 // the server starts and so the callback will never fire prior to 1083 // that second Initialize() call. Note also that at this point in 1084 // the code we don't know if we'll bootstrap or join an existing 1085 // cluster, so we have to conservatively go with the version from 1086 // disk, which in the case of no initialized engines is the binary 1087 // min supported version. 1088 if err := clusterversion.Initialize(ctx, diskClusterVersion.Version, &s.cfg.Settings.SV); err != nil { 1089 return err 1090 } 1091 1092 // At this point, we've established the invariant: all engines hold the 1093 // version currently visible to the setting. And we have the callback in 1094 // place that will persist an incoming updated version on all engines 1095 // before making it visible to the setting. 1096 } 1097 1098 serverpb.RegisterInitServer(s.grpc.Server, initServer) 1099 1100 s.node.startAssertEngineHealth(ctx, s.engines) 1101 1102 // Start the RPC server. This opens the RPC/SQL listen socket, 1103 // and dispatches the server worker for the RPC. 1104 // The SQL listener is returned, to start the SQL server later 1105 // below when the server has initialized. 1106 pgL, startRPCServer, err := s.startListenRPCAndSQL(ctx, workersCtx) 1107 if err != nil { 1108 return err 1109 } 1110 1111 if s.cfg.TestingKnobs.Server != nil { 1112 knobs := s.cfg.TestingKnobs.Server.(*TestingKnobs) 1113 if knobs.SignalAfterGettingRPCAddress != nil { 1114 close(knobs.SignalAfterGettingRPCAddress) 1115 } 1116 if knobs.PauseAfterGettingRPCAddress != nil { 1117 <-knobs.PauseAfterGettingRPCAddress 1118 } 1119 } 1120 1121 // Enable the debug endpoints first to provide an earlier window into what's 1122 // going on with the node in advance of exporting node functionality. 1123 // 1124 // TODO(marc): when cookie-based authentication exists, apply it to all web 1125 // endpoints. 1126 s.mux.Handle(debug.Endpoint, s.debug) 1127 1128 // Initialize grpc-gateway mux and context in order to get the /health 1129 // endpoint working even before the node has fully initialized. 
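//
// HTTP requests that reach the gateway are proxied to the local gRPC server
// over the in-process loopback connection established below, so they do not
// take an extra network hop.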
1130 jsonpb := &protoutil.JSONPb{ 1131 EnumsAsInts: true, 1132 EmitDefaults: true, 1133 Indent: " ", 1134 } 1135 protopb := new(protoutil.ProtoPb) 1136 gwMux := gwruntime.NewServeMux( 1137 gwruntime.WithMarshalerOption(gwruntime.MIMEWildcard, jsonpb), 1138 gwruntime.WithMarshalerOption(httputil.JSONContentType, jsonpb), 1139 gwruntime.WithMarshalerOption(httputil.AltJSONContentType, jsonpb), 1140 gwruntime.WithMarshalerOption(httputil.ProtoContentType, protopb), 1141 gwruntime.WithMarshalerOption(httputil.AltProtoContentType, protopb), 1142 gwruntime.WithOutgoingHeaderMatcher(authenticationHeaderMatcher), 1143 gwruntime.WithMetadata(forwardAuthenticationMetadata), 1144 ) 1145 gwCtx, gwCancel := context.WithCancel(s.AnnotateCtx(context.Background())) 1146 s.stopper.AddCloser(stop.CloserFn(gwCancel)) 1147 1148 // loopback handles the HTTP <-> RPC loopback connection. 1149 loopback := newLoopbackListener(workersCtx, s.stopper) 1150 1151 s.stopper.RunWorker(workersCtx, func(workersCtx context.Context) { 1152 <-s.stopper.ShouldQuiesce() 1153 _ = loopback.Close() 1154 }) 1155 1156 s.stopper.RunWorker(workersCtx, func(context.Context) { 1157 netutil.FatalIfUnexpected(s.grpc.Serve(loopback)) 1158 }) 1159 1160 // Eschew `(*rpc.Context).GRPCDial` to avoid unnecessary moving parts on the 1161 // uniquely in-process connection. 1162 dialOpts, err := s.rpcContext.GRPCDialOptions() 1163 if err != nil { 1164 return err 1165 } 1166 conn, err := grpc.DialContext(ctx, s.cfg.AdvertiseAddr, append( 1167 dialOpts, 1168 grpc.WithContextDialer(func(ctx context.Context, _ string) (net.Conn, error) { 1169 return loopback.Connect(ctx) 1170 }), 1171 )...) 1172 if err != nil { 1173 return err 1174 } 1175 s.stopper.RunWorker(workersCtx, func(workersCtx context.Context) { 1176 <-s.stopper.ShouldQuiesce() 1177 if err := conn.Close(); err != nil { 1178 log.Fatalf(workersCtx, "%v", err) 1179 } 1180 }) 1181 1182 for _, gw := range []grpcGatewayServer{s.admin, s.status, s.authentication, s.tsServer} { 1183 if err := gw.RegisterGateway(gwCtx, gwMux, conn); err != nil { 1184 return err 1185 } 1186 } 1187 // Handle /health early. This is necessary for orchestration. Note 1188 // that /health is not authenticated, on purpose. This is both 1189 // because it needs to be available before the cluster is up and can 1190 // serve authentication requests, and also because it must work for 1191 // monitoring tools which operate without authentication. 1192 s.mux.Handle("/health", gwMux) 1193 1194 // Write listener info files early in the startup sequence. `listenerInfo` has a comment. 1195 listenerFiles := listenerInfo{ 1196 listenRPC: s.cfg.Addr, 1197 advertiseRPC: s.cfg.AdvertiseAddr, 1198 listenSQL: s.cfg.SQLAddr, 1199 advertiseSQL: s.cfg.SQLAdvertiseAddr, 1200 listenHTTP: s.cfg.HTTPAdvertiseAddr, 1201 }.Iter() 1202 1203 for _, storeSpec := range s.cfg.Stores.Specs { 1204 if storeSpec.InMemory { 1205 continue 1206 } 1207 1208 for name, val := range listenerFiles { 1209 file := filepath.Join(storeSpec.Path, name) 1210 if err := ioutil.WriteFile(file, []byte(val), 0644); err != nil { 1211 return errors.Wrapf(err, "failed to write %s", file) 1212 } 1213 } 1214 } 1215 1216 // Filter the gossip bootstrap resolvers based on the listen and 1217 // advertise addresses. 
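// In particular, a resolver that points at this node's own listen or
// advertise address is dropped, so a node that appears in its own --join
// list does not try to bootstrap gossip by connecting to itself.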
	listenAddrU := util.NewUnresolvedAddr("tcp", s.cfg.Addr)
	advAddrU := util.NewUnresolvedAddr("tcp", s.cfg.AdvertiseAddr)
	advSQLAddrU := util.NewUnresolvedAddr("tcp", s.cfg.SQLAdvertiseAddr)
	filtered := s.cfg.FilterGossipBootstrapResolvers(ctx, listenAddrU, advAddrU)

	s.gossip.Start(advAddrU, filtered)
	log.Event(ctx, "started gossip")

	if s.cfg.DelayedBootstrapFn != nil {
		defer time.AfterFunc(30*time.Second, s.cfg.DelayedBootstrapFn).Stop()
	}

	// Set up calling s.cfg.ReadyFn at the right time. Essentially, this call
	// determines when `./cockroach [...] --background` returns. For any initialized
	// nodes (i.e. already part of a cluster) this is when this method returns
	// (assuming there's no error). For nodes that need to join a cluster, we
	// return once the initServer is ready to accept requests.
	var onSuccessfulReturnFn, onInitServerReady func()
	selfBootstrap := initServer.NeedsInit() && len(s.cfg.GossipBootstrapResolvers) == 0
	if selfBootstrap {
		// If a new node is started without join flags, self-bootstrap.
		//
		// Note: this is behavior slated for removal, see:
		// https://github.com/cockroachdb/cockroach/pull/44112
		_, err := initServer.Bootstrap(ctx, &serverpb.BootstrapRequest{})
		switch {
		case err == nil:
			log.Infof(ctx, "**** add additional nodes by specifying --join=%s", s.cfg.AdvertiseAddr)
		case errors.Is(err, ErrClusterInitialized):
		default:
			// Process is shutting down.
		}
	}
	{
		readyFn := func(bool) {}
		if s.cfg.ReadyFn != nil {
			readyFn = s.cfg.ReadyFn
		}
		if !initServer.NeedsInit() || selfBootstrap {
			onSuccessfulReturnFn = func() { readyFn(false /* waitForInit */) }
			onInitServerReady = func() {}
		} else {
			onSuccessfulReturnFn = func() {}
			onInitServerReady = func() { readyFn(true /* waitForInit */) }
		}
	}

	// This opens the main listener. When the listener is open, we can call
	// onInitServerReady since any request initiated to the initServer at that
	// point will reach it once ServeAndWait starts handling the queue of incoming
	// connections.
	startRPCServer(workersCtx)
	onInitServerReady()
	state, err := initServer.ServeAndWait(ctx, s.stopper, &s.cfg.Settings.SV, s.gossip)
	if err != nil {
		return errors.Wrap(err, "during init")
	}

	s.rpcContext.ClusterID.Set(ctx, state.clusterID)
	// If there's no NodeID here, then we didn't just bootstrap. The Node will
	// read its ID from the stores or request a new one via KV.
	if state.nodeID != 0 {
		s.rpcContext.NodeID.Set(ctx, state.nodeID)
	}

	// TODO(tbg): split this method here. Everything above this comment is
	// the early stage of startup -- setting up listeners and determining the
	// initState -- and everything after it is actually starting the server,
	// using the listeners and init state.

	// Defense in depth: set up an eager sanity check that we're not
	// accidentally being pointed at a different cluster. We have checks for
	// this in the RPC layer, but since the RPC layer gets set up before the
	// clusterID is known, early connections won't validate the clusterID (at
	// least not until the next Ping).
	//
	// The check is simple: listen for clusterID changes from Gossip.
If we see 1295 // one, make sure it's the clusterID we already know (and are guaranteed to 1296 // know) at this point. If it's not the same, explode. 1297 // 1298 // TODO(tbg): remove this when we have changed ServeAndWait() to join an 1299 // existing cluster via a one-off RPC, at which point we can create gossip 1300 // (and thus the RPC layer) only after the clusterID is already known. We 1301 // can then rely on the RPC layer's protection against cross-cluster 1302 // communication. 1303 { 1304 // We populated this above, so it should still be set. This is just to 1305 // demonstrate that we're not doing anything functional here (and to 1306 // prevent bugs during further refactors). 1307 if s.rpcContext.ClusterID.Get() == uuid.Nil { 1308 return errors.New("gossip should already be connected") 1309 } 1310 unregister := s.gossip.RegisterCallback(gossip.KeyClusterID, func(string, roachpb.Value) { 1311 clusterID, err := s.gossip.GetClusterID() 1312 if err != nil { 1313 log.Fatalf(ctx, "unable to read ClusterID: %v", err) 1314 } 1315 s.rpcContext.ClusterID.Set(ctx, clusterID) // fatals on mismatch 1316 }) 1317 defer unregister() 1318 } 1319 1320 // Spawn a goroutine that will print a nice message when Gossip connects. 1321 // Note that we already know the clusterID, but we don't know that Gossip 1322 // has connected. The pertinent case is that of restarting an entire 1323 // cluster. Someone has to gossip the ClusterID before Gossip is connected, 1324 // but this gossip only happens once the first range has a leaseholder, i.e. 1325 // when a quorum of nodes has gone fully operational. 1326 _ = s.stopper.RunAsyncTask(ctx, "connect-gossip", func(ctx context.Context) { 1327 log.Infof(ctx, "connecting to gossip network to verify cluster ID %q", state.clusterID) 1328 select { 1329 case <-s.gossip.Connected: 1330 log.Infof(ctx, "node connected via gossip") 1331 case <-ctx.Done(): 1332 case <-s.stopper.ShouldQuiesce(): 1333 } 1334 }) 1335 1336 // NB: if this store is freshly bootstrapped (or no upper bound was 1337 // persisted), hlcUpperBound will be zero. 1338 hlcUpperBound, err := kvserver.ReadMaxHLCUpperBound(ctx, s.engines) 1339 if err != nil { 1340 return errors.Wrap(err, "reading max HLC upper bound") 1341 } 1342 1343 if hlcUpperBound > 0 { 1344 ensureClockMonotonicity( 1345 ctx, 1346 s.clock, 1347 s.startTime, 1348 hlcUpperBound, 1349 timeutil.SleepUntil, 1350 ) 1351 } 1352 1353 // Record a walltime that is lower than the lowest hlc timestamp this current 1354 // instance of the node can use. We do not use startTime because it is lower 1355 // than the timestamp used to create the bootstrap schema. 1356 // 1357 // TODO(tbg): clarify the contract here and move closer to usage if possible. 1358 orphanedLeasesTimeThresholdNanos := s.clock.Now().WallTime 1359 1360 onSuccessfulReturnFn() 1361 1362 // Now that we have a monotonic HLC wrt previous incarnations of the process, 1363 // init all the replicas. At this point *some* store has been bootstrapped or 1364 // we're joining an existing cluster for the first time. 
1365 if err := s.node.start( 1366 ctx, 1367 advAddrU, advSQLAddrU, 1368 *state, 1369 s.cfg.ClusterName, 1370 s.cfg.NodeAttributes, 1371 s.cfg.Locality, 1372 s.cfg.LocalityAddresses, 1373 s.sqlServer.execCfg.DistSQLPlanner.SetNodeDesc, 1374 ); err != nil { 1375 return err 1376 } 1377 1378 log.Event(ctx, "started node") 1379 if err := s.startPersistingHLCUpperBound( 1380 ctx, 1381 hlcUpperBound > 0, 1382 func(t int64) error { /* function to persist upper bound of HLC to all stores */ 1383 return s.node.SetHLCUpperBound(context.Background(), t) 1384 }, 1385 time.NewTicker, 1386 ); err != nil { 1387 return err 1388 } 1389 s.replicationReporter.Start(ctx, s.stopper) 1390 1391 s.refreshSettings() 1392 1393 sentry.ConfigureScope(func(scope *sentry.Scope) { 1394 scope.SetTags(map[string]string{ 1395 "cluster": s.ClusterID().String(), 1396 "node": s.NodeID().String(), 1397 "server_id": fmt.Sprintf("%s-%s", s.ClusterID().Short(), s.NodeID()), 1398 "engine_type": s.cfg.StorageEngine.String(), 1399 }) 1400 }) 1401 1402 // We can now add the node registry. 1403 s.recorder.AddNode(s.registry, s.node.Descriptor, s.node.startedAt, s.cfg.AdvertiseAddr, s.cfg.HTTPAdvertiseAddr, s.cfg.SQLAdvertiseAddr) 1404 1405 // Begin recording runtime statistics. 1406 if err := s.startSampleEnvironment(ctx, base.DefaultMetricsSampleInterval); err != nil { 1407 return err 1408 } 1409 1410 // Begin recording time series data collected by the status monitor. 1411 s.tsDB.PollSource( 1412 s.cfg.AmbientCtx, s.recorder, base.DefaultMetricsSampleInterval, ts.Resolution10s, s.stopper, 1413 ) 1414 1415 var graphiteOnce sync.Once 1416 graphiteEndpoint.SetOnChange(&s.st.SV, func() { 1417 if graphiteEndpoint.Get(&s.st.SV) != "" { 1418 graphiteOnce.Do(func() { 1419 s.node.startGraphiteStatsExporter(s.st) 1420 }) 1421 } 1422 }) 1423 1424 s.grpc.setMode(modeOperational) 1425 1426 log.Infof(ctx, "starting %s server at %s (use: %s)", 1427 s.cfg.HTTPRequestScheme(), s.cfg.HTTPAddr, s.cfg.HTTPAdvertiseAddr) 1428 rpcConnType := "grpc/postgres" 1429 if s.cfg.SplitListenSQL { 1430 rpcConnType = "grpc" 1431 log.Infof(ctx, "starting postgres server at %s (use: %s)", s.cfg.SQLAddr, s.cfg.SQLAdvertiseAddr) 1432 } 1433 log.Infof(ctx, "starting %s server at %s", rpcConnType, s.cfg.Addr) 1434 log.Infof(ctx, "advertising CockroachDB node at %s", s.cfg.AdvertiseAddr) 1435 1436 log.Event(ctx, "accepting connections") 1437 1438 if state.bootstrapped { 1439 // If a new cluster is just starting up, force all the system ranges 1440 // through the replication queue so they upreplicate as quickly as 1441 // possible when a new node joins. Without this code, the upreplication 1442 // would be up to the whim of the scanner, which might be too slow for 1443 // new clusters. 1444 // TODO(tbg): instead of this dubious band-aid we should make the 1445 // replication queue reactive enough to avoid relying on the scanner 1446 // alone. 1447 var done bool 1448 return s.node.stores.VisitStores(func(store *kvserver.Store) error { 1449 if !done { 1450 done = true 1451 return store.ForceReplicationScanAndProcess() 1452 } 1453 return nil 1454 }) 1455 } 1456 1457 // Begin the node liveness heartbeat. Add a callback which records the local 1458 // store "last up" timestamp for every store whenever the liveness record is 1459 // updated. 
1460 s.nodeLiveness.StartHeartbeat(ctx, s.stopper, s.engines, func(ctx context.Context) { 1461 now := s.clock.Now() 1462 if err := s.node.stores.VisitStores(func(s *kvserver.Store) error { 1463 return s.WriteLastUpTimestamp(ctx, now) 1464 }); err != nil { 1465 log.Warningf(ctx, "writing last up timestamp: %v", err) 1466 } 1467 }) 1468 1469 // Begin recording status summaries. 1470 s.node.startWriteNodeStatus(base.DefaultMetricsSampleInterval) 1471 1472 // Start the protected timestamp subsystem. 1473 if err := s.protectedtsProvider.Start(ctx, s.stopper); err != nil { 1474 return err 1475 } 1476 if err := s.protectedtsReconciler.Start(ctx, s.stopper); err != nil { 1477 return err 1478 } 1479 1480 // Start garbage collecting system events. 1481 // 1482 // NB: As written, this falls awkwardly between SQL and KV. KV is used only 1483 // to make sure this runs only on one node. SQL is used to actually GC. We 1484 // count it as a KV operation since it grooms cluster-wide data, not 1485 // something associated to SQL tenants. 1486 s.startSystemLogsGC(ctx) 1487 1488 // Serve UI assets. 1489 // 1490 // The authentication mux used here is created in "allow anonymous" mode so that the UI 1491 // assets are served up whether or not there is a session. If there is a session, the mux 1492 // adds it to the context, and it is templated into index.html so that the UI can show 1493 // the username of the currently-logged-in user. 1494 authenticatedUIHandler := newAuthenticationMuxAllowAnonymous( 1495 s.authentication, 1496 ui.Handler(ui.Config{ 1497 ExperimentalUseLogin: s.cfg.EnableWebSessionAuthentication, 1498 LoginEnabled: s.cfg.RequireWebSession(), 1499 NodeID: s.nodeIDContainer, 1500 GetUser: func(ctx context.Context) *string { 1501 if u, ok := ctx.Value(webSessionUserKey{}).(string); ok { 1502 return &u 1503 } 1504 return nil 1505 }, 1506 }), 1507 ) 1508 s.mux.Handle("/", authenticatedUIHandler) 1509 1510 // Register gRPC-gateway endpoints used by the admin UI. 1511 var authHandler http.Handler = gwMux 1512 if s.cfg.RequireWebSession() { 1513 authHandler = newAuthenticationMux(s.authentication, authHandler) 1514 } 1515 1516 s.mux.Handle(adminPrefix, authHandler) 1517 // Exempt the health check endpoint from authentication. 1518 // This mirrors the handling of /health above. 1519 s.mux.Handle("/_admin/v1/health", gwMux) 1520 s.mux.Handle(ts.URLPrefix, authHandler) 1521 s.mux.Handle(statusPrefix, authHandler) 1522 // The /login endpoint is, by definition, available pre-authentication. 1523 s.mux.Handle(loginPath, gwMux) 1524 s.mux.Handle(logoutPath, authHandler) 1525 // The /_status/vars endpoint is not authenticated either. Useful for monitoring. 1526 s.mux.Handle(statusVars, http.HandlerFunc(s.status.handleVars)) 1527 log.Event(ctx, "added http endpoints") 1528 1529 // Attempt to upgrade cluster version. 1530 s.startAttemptUpgrade(ctx) 1531 1532 // Record node start in telemetry. Get the right counter for this storage 1533 // engine type as well as type of start (initial boot vs restart). 1534 nodeStartCounter := "storage.engine." 1535 switch s.cfg.StorageEngine { 1536 case enginepb.EngineTypePebble: 1537 nodeStartCounter += "pebble." 1538 case enginepb.EngineTypeDefault: 1539 nodeStartCounter += "default." 1540 case enginepb.EngineTypeRocksDB: 1541 nodeStartCounter += "rocksdb." 1542 case enginepb.EngineTypeTeePebbleRocksDB: 1543 nodeStartCounter += "pebble+rocksdb." 
func (s *Server) startServeUI(
	ctx, workersCtx context.Context, connManager netutil.Server, uiTLSConfig *tls.Config,
) error {
	httpLn, err := listen(ctx, &s.cfg.HTTPAddr, &s.cfg.HTTPAdvertiseAddr, "http")
	if err != nil {
		return err
	}
	log.Eventf(ctx, "listening on http port %s", s.cfg.HTTPAddr)

	// The HTTP listener shutdown worker, which closes everything under
	// the HTTP port when the stopper indicates we are shutting down.
	s.stopper.RunWorker(workersCtx, func(workersCtx context.Context) {
		<-s.stopper.ShouldQuiesce()
		if err := httpLn.Close(); err != nil {
			log.Fatalf(workersCtx, "%v", err)
		}
	})

	if uiTLSConfig != nil {
		httpMux := cmux.New(httpLn)
		clearL := httpMux.Match(cmux.HTTP1())
		tlsL := httpMux.Match(cmux.Any())

		// Dispatch incoming requests to either clearL or tlsL.
		s.stopper.RunWorker(workersCtx, func(context.Context) {
			netutil.FatalIfUnexpected(httpMux.Serve())
		})

		// Serve the plain HTTP (non-TLS) connection over clearL.
		// This serves the path /health normally (via s.ServeHTTP) and
		// redirects every other path to the equivalent `https` URL, since
		// the "/" pattern matches any path without a more specific handler.
		s.stopper.RunWorker(workersCtx, func(context.Context) {
			mux := http.NewServeMux()
			mux.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
				http.Redirect(w, r, "https://"+r.Host+r.RequestURI, http.StatusTemporaryRedirect)
			})
			mux.Handle("/health", s)

			plainRedirectServer := netutil.MakeServer(s.stopper, uiTLSConfig, mux)

			netutil.FatalIfUnexpected(plainRedirectServer.Serve(clearL))
		})

		httpLn = tls.NewListener(tlsL, uiTLSConfig)
	}

	// Serve the HTTP endpoint. This will be the original httpLn
	// listening on --http-addr without TLS if uiTLSConfig was
	// nil, or overridden above if uiTLSConfig was not nil to come from
	// the TLS negotiation over the HTTP port.
	s.stopper.RunWorker(workersCtx, func(context.Context) {
		netutil.FatalIfUnexpected(connManager.Serve(httpLn))
	})

	return nil
}
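// When TLS is enabled, startServeUI above still answers plain-HTTP requests
// on the same port: /health is served directly and everything else is
// redirected to the equivalent https:// URL. A self-contained sketch of that
// redirect mux using only net/http; healthHandler is a hypothetical stand-in
// for the server's own handler:
func newPlainHTTPRedirectMux(healthHandler http.Handler) *http.ServeMux {
	mux := http.NewServeMux()
	// "/" is the catch-all pattern: every path without a more specific match
	// is redirected to HTTPS, preserving host and request URI.
	mux.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
		http.Redirect(w, r, "https://"+r.Host+r.RequestURI, http.StatusTemporaryRedirect)
	})
	// /health stays reachable over plain HTTP, e.g. for load-balancer probes.
	mux.Handle("/health", healthHandler)
	return mux
}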
// TODO(tbg): move into server_sql.go.
func (s *sqlServer) startServeSQL(
	ctx context.Context,
	stopper *stop.Stopper,
	connManager netutil.Server,
	pgL net.Listener,
	socketFile string,
) error {
	log.Info(ctx, "serving sql connections")
	// Start servicing SQL connections.

	pgCtx := s.pgServer.AmbientCtx.AnnotateCtx(context.Background())
	tcpKeepAlive := tcpKeepAliveManager{
		tcpKeepAlive: envutil.EnvOrDefaultDuration("COCKROACH_SQL_TCP_KEEP_ALIVE", time.Minute),
	}

	stopper.RunWorker(pgCtx, func(pgCtx context.Context) {
		netutil.FatalIfUnexpected(connManager.ServeWith(pgCtx, stopper, pgL, func(conn net.Conn) {
			connCtx := logtags.AddTag(pgCtx, "client", conn.RemoteAddr().String())
			tcpKeepAlive.configure(connCtx, conn)

			if err := s.pgServer.ServeConn(connCtx, conn, pgwire.SocketTCP); err != nil {
				log.Errorf(connCtx, "serving SQL client conn: %v", err)
			}
		}))
	})

	// If a unix socket was requested, start serving there too.
	if len(socketFile) != 0 {
		log.Infof(ctx, "starting postgres server at unix:%s", socketFile)

		// Unix socket enabled: postgres protocol only.
		unixLn, err := net.Listen("unix", socketFile)
		if err != nil {
			return err
		}

		stopper.RunWorker(ctx, func(workersCtx context.Context) {
			<-stopper.ShouldQuiesce()
			if err := unixLn.Close(); err != nil {
				log.Fatalf(workersCtx, "%v", err)
			}
		})

		stopper.RunWorker(pgCtx, func(pgCtx context.Context) {
			netutil.FatalIfUnexpected(connManager.ServeWith(pgCtx, stopper, unixLn, func(conn net.Conn) {
				connCtx := logtags.AddTag(pgCtx, "client", conn.RemoteAddr().String())
				if err := s.pgServer.ServeConn(connCtx, conn, pgwire.SocketUnix); err != nil {
					log.Errorf(connCtx, "%v", err)
				}
			}))
		})
	}
	return nil
}
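// startServeSQL above hands every accepted connection to a callback that
// tags the connection context with the client address before running the
// pgwire session. A reduced sketch of such an accept loop with plain net and
// context; handleConn and clientAddrKey are hypothetical stand-ins (the code
// above uses netutil.Server.ServeWith and logtags.AddTag instead):
func serveConns(ctx context.Context, ln net.Listener, handleConn func(context.Context, net.Conn) error) error {
	for {
		conn, err := ln.Accept()
		if err != nil {
			// The code above routes this through netutil.FatalIfUnexpected so
			// that listener closure during shutdown is not treated as fatal.
			return err
		}
		go func(conn net.Conn) {
			defer conn.Close()
			// A context value is the stdlib-only analogue of the log tag used
			// in the code above.
			connCtx := context.WithValue(ctx, clientAddrKey{}, conn.RemoteAddr().String())
			if err := handleConn(connCtx, conn); err != nil {
				log.Errorf(connCtx, "serving SQL client conn: %v", err)
			}
		}(conn)
	}
}

// clientAddrKey is an illustrative context key type for the sketch above.
type clientAddrKey struct{}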
// Decommission idempotently sets the decommissioning flag for specified nodes.
func (s *Server) Decommission(ctx context.Context, setTo bool, nodeIDs []roachpb.NodeID) error {
	eventLogger := sql.MakeEventLogger(s.sqlServer.execCfg)
	eventType := sql.EventLogNodeDecommissioned
	if !setTo {
		eventType = sql.EventLogNodeRecommissioned
	}
	for _, nodeID := range nodeIDs {
		changeCommitted, err := s.nodeLiveness.SetDecommissioning(ctx, nodeID, setTo)
		if err != nil {
			return errors.Wrapf(err, "during liveness update %d -> %t", nodeID, setTo)
		}
		if changeCommitted {
			// If we die right now or if this transaction fails to commit, the
			// commissioning event will not be recorded to the event log. While we
			// could insert the event record in the same transaction as the liveness
			// update, this would force a 2PC and potentially leave write intents in
			// the node liveness range. Better to make the event logging best effort
			// than to slow down future node liveness transactions.
			if err := s.db.Txn(ctx, func(ctx context.Context, txn *kv.Txn) error {
				return eventLogger.InsertEventRecord(
					ctx, txn, eventType, int32(nodeID), int32(s.NodeID()), struct{}{},
				)
			}); err != nil {
				log.Errorf(ctx, "unable to record %s event for node %d: %s", eventType, nodeID, err)
			}
		}
	}
	return nil
}
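// Decommission above deliberately records the event log entry outside the
// liveness update: the liveness change is the source of truth, and the log
// write is best effort so that a failure (or a crash in between) never blocks
// or rolls back the decommission itself. A generic sketch of that pattern
// with hypothetical update/logEvent callbacks:
func updateThenLogBestEffort(
	ctx context.Context,
	update func(context.Context) (changed bool, _ error),
	logEvent func(context.Context) error,
) error {
	changed, err := update(ctx)
	if err != nil {
		// The critical update failed; surface the error to the caller.
		return err
	}
	if changed {
		if err := logEvent(ctx); err != nil {
			// Best effort only: warn and keep going rather than failing an
			// already-committed update.
			log.Warningf(ctx, "unable to record event: %v", err)
		}
	}
	return nil
}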
// startSampleEnvironment begins the heap profiler worker.
func (s *Server) startSampleEnvironment(ctx context.Context, frequency time.Duration) error {
	// Immediately record summaries once on server startup.
	ctx = s.AnnotateCtx(ctx)

	// We're not going to take heap profiles or goroutine dumps if
	// running only with in-memory stores. This helps some tests that
	// can't write any files.
	allStoresInMem := true
	for _, storeSpec := range s.cfg.Stores.Specs {
		if !storeSpec.InMemory {
			allStoresInMem = false
			break
		}
	}

	var goroutineDumper *goroutinedumper.GoroutineDumper
	var heapProfiler *heapprofiler.HeapProfiler

	if !allStoresInMem {
		var err error
		if s.cfg.GoroutineDumpDirName != "" {
			if err := os.MkdirAll(s.cfg.GoroutineDumpDirName, 0755); err != nil {
				return errors.Wrap(err, "creating goroutine dump dir")
			}
			goroutineDumper, err = goroutinedumper.NewGoroutineDumper(s.cfg.GoroutineDumpDirName)
			if err != nil {
				return errors.Wrap(err, "starting goroutine dumper worker")
			}
		}

		if s.cfg.HeapProfileDirName != "" {
			if err := os.MkdirAll(s.cfg.HeapProfileDirName, 0755); err != nil {
				return errors.Wrap(err, "creating heap profiles dir")
			}
			heapProfiler, err = heapprofiler.NewHeapProfiler(s.cfg.HeapProfileDirName, s.ClusterSettings())
			if err != nil {
				return errors.Wrap(err, "starting heap profiler worker")
			}
		}
	}

	s.stopper.RunWorker(ctx, func(ctx context.Context) {
		var goMemStats atomic.Value // *status.GoMemStats
		goMemStats.Store(&status.GoMemStats{})
		var collectingMemStats int32 // atomic, 1 when stats call is ongoing

		timer := timeutil.NewTimer()
		defer timer.Stop()
		timer.Reset(frequency)

		for {
			select {
			case <-s.stopper.ShouldStop():
				return
			case <-timer.C:
				timer.Read = true
				timer.Reset(frequency)

				// We read the heap stats on another goroutine and give up after 1s.
				// This is necessary because as of Go 1.12, runtime.ReadMemStats()
				// "stops the world" and that requires first waiting for any current GC
				// run to finish. With a large heap and under extreme conditions, a
				// single GC run may take longer than the default sampling period of
				// 10s. Under normal operations and with more recent versions of Go,
				// this hasn't been observed to be a problem.
				statsCollected := make(chan struct{})
				if atomic.CompareAndSwapInt32(&collectingMemStats, 0, 1) {
					if err := s.stopper.RunAsyncTask(ctx, "get-mem-stats", func(ctx context.Context) {
						var ms status.GoMemStats
						runtime.ReadMemStats(&ms.MemStats)
						ms.Collected = timeutil.Now()
						log.VEventf(ctx, 2, "memstats: %+v", ms)

						goMemStats.Store(&ms)
						atomic.StoreInt32(&collectingMemStats, 0)
						close(statsCollected)
					}); err != nil {
						close(statsCollected)
					}
				}

				select {
				case <-statsCollected:
					// Good; we managed to read the Go memory stats quickly enough.
				case <-time.After(time.Second):
				}

				curStats := goMemStats.Load().(*status.GoMemStats)
				s.runtime.SampleEnvironment(ctx, *curStats)
				if goroutineDumper != nil {
					goroutineDumper.MaybeDump(ctx, s.ClusterSettings(), s.runtime.Goroutines.Value())
				}
				if heapProfiler != nil {
					heapProfiler.MaybeTakeProfile(ctx, curStats.MemStats)
				}
			}
		}
	})
	return nil
}

// Stop stops the server.
func (s *Server) Stop() {
	s.stopper.Stop(context.Background())
}
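// startSampleEnvironment above guards runtime.ReadMemStats with an atomic
// flag and a one-second timeout: the read "stops the world", so at most one
// read is in flight and a slow read never blocks the sampling loop. A
// stdlib-only sketch of that guard; the function name and parameters are
// illustrative, not part of this package:
func readMemStatsNonBlocking(inFlight *int32, timeout time.Duration) (runtime.MemStats, bool) {
	var ms runtime.MemStats
	if !atomic.CompareAndSwapInt32(inFlight, 0, 1) {
		// A previous read is still running; skip this sample.
		return ms, false
	}
	done := make(chan struct{})
	go func() {
		runtime.ReadMemStats(&ms)
		atomic.StoreInt32(inFlight, 0)
		close(done)
	}()
	select {
	case <-done:
		// The stats were read quickly enough to use for this sample.
		return ms, true
	case <-time.After(timeout):
		// Give up for now; the goroutine clears the flag when it finishes.
		return runtime.MemStats{}, false
	}
}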
// ServeHTTP is necessary to implement the http.Handler interface.
func (s *Server) ServeHTTP(w http.ResponseWriter, r *http.Request) {
	// This is our base handler, so catch all panics and make sure they stick.
	defer log.FatalOnPanic()

	// Disable caching of responses.
	w.Header().Set("Cache-control", "no-cache")

	ae := r.Header.Get(httputil.AcceptEncodingHeader)
	switch {
	case strings.Contains(ae, httputil.GzipEncoding):
		w.Header().Set(httputil.ContentEncodingHeader, httputil.GzipEncoding)
		gzw := newGzipResponseWriter(w)
		defer func() {
			// Certain requests must not have a body, yet closing the gzip writer will
			// attempt to write the gzip header. Avoid logging a warning in this case.
			// This is notably triggered by:
			//
			// curl -H 'Accept-Encoding: gzip' \
			//      -H 'If-Modified-Since: Thu, 29 Mar 2018 22:36:32 GMT' \
			//      -v http://localhost:8080/favicon.ico > /dev/null
			//
			// which results in a 304 Not Modified.
			if err := gzw.Close(); err != nil && !errors.Is(err, http.ErrBodyNotAllowed) {
				ctx := s.AnnotateCtx(r.Context())
				log.Warningf(ctx, "error closing gzip response writer: %v", err)
			}
		}()
		w = gzw
	}
	s.mux.ServeHTTP(w, r)
}

// TempDir returns the filepath of the temporary directory used for temp storage.
// It is empty for an in-memory temp storage.
func (s *Server) TempDir() string {
	return s.cfg.TempStorageConfig.Path
}

// PGServer exports the pgwire server. Used by tests.
func (s *Server) PGServer() *pgwire.Server {
	return s.sqlServer.pgServer
}

// TODO(benesch): Use https://github.com/NYTimes/gziphandler instead.
// gzipResponseWriter reinvents the wheel and is not as robust.
type gzipResponseWriter struct {
	gz gzip.Writer
	http.ResponseWriter
}

func newGzipResponseWriter(rw http.ResponseWriter) *gzipResponseWriter {
	var w *gzipResponseWriter
	if wI := gzipResponseWriterPool.Get(); wI == nil {
		w = new(gzipResponseWriter)
	} else {
		w = wI.(*gzipResponseWriter)
	}
	w.Reset(rw)
	return w
}

func (w *gzipResponseWriter) Reset(rw http.ResponseWriter) {
	w.gz.Reset(rw)
	w.ResponseWriter = rw
}

func (w *gzipResponseWriter) Write(b []byte) (int, error) {
	// The underlying http.ResponseWriter can't sniff gzipped data properly, so we
	// do our own sniffing on the uncompressed data.
	if w.Header().Get("Content-Type") == "" {
		w.Header().Set("Content-Type", http.DetectContentType(b))
	}
	return w.gz.Write(b)
}

// Flush implements http.Flusher as required by grpc-gateway for clients
// which access streaming endpoints (as exercised by the acceptance tests
// at time of writing).
func (w *gzipResponseWriter) Flush() {
	// If Flush returns an error, we'll see it on the next call to Write or
	// Close as well, so we can ignore it here.
	if err := w.gz.Flush(); err == nil {
		// Flush the wrapped ResponseWriter as well, if possible.
		if f, ok := w.ResponseWriter.(http.Flusher); ok {
			f.Flush()
		}
	}
}

// Close implements the io.Closer interface. It is not safe to use the
// writer after calling Close.
func (w *gzipResponseWriter) Close() error {
	err := w.gz.Close()
	w.Reset(nil) // release ResponseWriter reference.
	gzipResponseWriterPool.Put(w)
	return err
}
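// The gzipResponseWriter above is recycled through gzipResponseWriterPool:
// Get either returns a previously released writer or nil, Reset rebinds it to
// the new response, and Close flushes the stream before returning it to the
// pool. A minimal stdlib-only sketch of the same pooling idea applied to bare
// gzip.Writers; the pool and function here are illustrative and local to this
// sketch:
var sketchGzipPool = sync.Pool{
	New: func() interface{} { return gzip.NewWriter(nil) },
}

func gzipCopy(dst io.Writer, src io.Reader) error {
	gz := sketchGzipPool.Get().(*gzip.Writer)
	gz.Reset(dst) // rebind the pooled writer to this destination
	defer func() {
		// Close writes the gzip trailer; only then is the writer safe to reuse.
		// Any error from Close is dropped in this sketch.
		_ = gz.Close()
		sketchGzipPool.Put(gz)
	}()
	_, err := io.Copy(gz, src)
	return err
}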
func init() {
	tracing.RegisterTagRemapping("n", "node")
}

// configure attempts to set TCP keep-alive on the
// connection. It does not fail on errors.
func (k *tcpKeepAliveManager) configure(ctx context.Context, conn net.Conn) {
	if k.tcpKeepAlive == 0 {
		return
	}

	muxConn, ok := conn.(*cmux.MuxConn)
	if !ok {
		return
	}
	tcpConn, ok := muxConn.Conn.(*net.TCPConn)
	if !ok {
		return
	}

	// Only log success/failure once.
	doLog := atomic.CompareAndSwapInt32(&k.loggedKeepAliveStatus, 0, 1)
	if err := tcpConn.SetKeepAlive(true); err != nil {
		if doLog {
			log.Warningf(ctx, "failed to enable TCP keep-alive for pgwire: %v", err)
		}
		return
	}
	if err := tcpConn.SetKeepAlivePeriod(k.tcpKeepAlive); err != nil {
		if doLog {
			log.Warningf(ctx, "failed to set TCP keep-alive duration for pgwire: %v", err)
		}
		return
	}

	if doLog {
		log.VEventf(ctx, 2, "setting TCP keep-alive to %s for pgwire", k.tcpKeepAlive)
	}
}

type tcpKeepAliveManager struct {
	// The keepalive duration.
	tcpKeepAlive time.Duration
	// loggedKeepAliveStatus ensures that errors about setting the TCP
	// keepalive status are only reported once.
	loggedKeepAliveStatus int32
}

func listen(
	ctx context.Context, addr, advertiseAddr *string, connName string,
) (net.Listener, error) {
	ln, err := net.Listen("tcp", *addr)
	if err != nil {
		return nil, &ListenError{
			cause: err,
			Addr:  *addr,
		}
	}
	if err := base.UpdateAddrs(ctx, addr, advertiseAddr, ln.Addr()); err != nil {
		return nil, errors.Wrapf(err, "internal error: cannot parse %s listen address", connName)
	}
	return ln, nil
}

// RunLocalSQL calls fn on a SQL internal executor on this server.
// This is meant for use for SQL initialization during bootstrapping.
//
// The internal SQL interface should be used instead of a regular SQL
// network connection for SQL initializations when setting up a new
// server, because it is possible for the server to listen on a
// network interface that is not reachable from loopback. It is also
// possible for the TLS certificates to be invalid when used locally
// (e.g. if the hostname in the cert is an advertised address that's
// only reachable externally).
func (s *Server) RunLocalSQL(
	ctx context.Context, fn func(ctx context.Context, sqlExec *sql.InternalExecutor) error,
) error {
	return fn(ctx, s.sqlServer.internalExecutor)
}
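// listen above delegates to base.UpdateAddrs so that, among other things, an
// ephemeral port request such as ":0" is rewritten to the address actually
// bound, and the advertised address can pick up the real port. A stdlib-only
// sketch of just the ephemeral-port part of that behavior; listenAndResolve
// is illustrative and not a replacement for the helper above:
func listenAndResolve(addr *string) (net.Listener, error) {
	ln, err := net.Listen("tcp", *addr)
	if err != nil {
		return nil, err
	}
	// ln.Addr() reports the concrete host:port chosen by the kernel, e.g.
	// "127.0.0.1:54321" for a ":0" request; write it back so callers and log
	// messages see the address clients can actually reach.
	*addr = ln.Addr().String()
	return ln, nil
}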