github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/server/node.go

// Copyright 2014 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package server

import (
	"context"
	"fmt"
	"net"
	"sort"
	"time"

	"github.com/cockroachdb/cockroach/pkg/base"
	"github.com/cockroachdb/cockroach/pkg/build"
	"github.com/cockroachdb/cockroach/pkg/clusterversion"
	"github.com/cockroachdb/cockroach/pkg/config"
	"github.com/cockroachdb/cockroach/pkg/config/zonepb"
	"github.com/cockroachdb/cockroach/pkg/gossip"
	"github.com/cockroachdb/cockroach/pkg/keys"
	"github.com/cockroachdb/cockroach/pkg/kv"
	"github.com/cockroachdb/cockroach/pkg/kv/kvclient/kvcoord"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/server/status"
	"github.com/cockroachdb/cockroach/pkg/settings"
	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
	"github.com/cockroachdb/cockroach/pkg/sql"
	"github.com/cockroachdb/cockroach/pkg/sql/sqlbase"
	"github.com/cockroachdb/cockroach/pkg/storage"
	"github.com/cockroachdb/cockroach/pkg/util"
	"github.com/cockroachdb/cockroach/pkg/util/growstack"
	"github.com/cockroachdb/cockroach/pkg/util/grpcutil"
	"github.com/cockroachdb/cockroach/pkg/util/hlc"
	"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/cockroachdb/cockroach/pkg/util/metric"
	"github.com/cockroachdb/cockroach/pkg/util/retry"
	"github.com/cockroachdb/cockroach/pkg/util/stop"
	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
	"github.com/cockroachdb/cockroach/pkg/util/tracing"
	"github.com/cockroachdb/cockroach/pkg/util/uuid"
	"github.com/cockroachdb/errors"
	"github.com/cockroachdb/logtags"
	opentracing "github.com/opentracing/opentracing-go"
)

const (
	// gossipStatusInterval is the interval for logging gossip status.
	gossipStatusInterval = 1 * time.Minute

	// FirstNodeID is the node ID of the first node in a new cluster.
	FirstNodeID         = 1
	graphiteIntervalKey = "external.graphite.interval"
	maxGraphiteInterval = 15 * time.Minute
)

// Metric names.
var (
	metaExecLatency = metric.Metadata{
		Name:        "exec.latency",
		Help:        "Latency of batch KV requests executed on this node",
		Measurement: "Latency",
		Unit:        metric.Unit_NANOSECONDS,
	}
	metaExecSuccess = metric.Metadata{
		Name:        "exec.success",
		Help:        "Number of batch KV requests executed successfully on this node",
		Measurement: "Batch KV Requests",
		Unit:        metric.Unit_COUNT,
	}
	metaExecError = metric.Metadata{
		Name:        "exec.error",
		Help:        "Number of batch KV requests that failed to execute on this node",
		Measurement: "Batch KV Requests",
		Unit:        metric.Unit_COUNT,
	}

	metaDiskStalls = metric.Metadata{
		Name:        "engine.stalls",
		Help:        "Number of disk stalls detected on this node",
		Measurement: "Disk stalls detected",
		Unit:        metric.Unit_COUNT,
	}
)

// Cluster settings.
var (
	// graphiteEndpoint is host:port, if any, of Graphite metrics server.
	graphiteEndpoint = settings.RegisterPublicStringSetting(
		"external.graphite.endpoint",
		"if nonempty, push server metrics to the Graphite or Carbon server at the specified host:port",
		"",
	)
	// graphiteInterval is how often metrics are pushed to Graphite, if enabled.
	graphiteInterval = settings.RegisterPublicNonNegativeDurationSettingWithMaximum(
		graphiteIntervalKey,
		"the interval at which metrics are pushed to Graphite (if enabled)",
		10*time.Second,
		maxGraphiteInterval,
	)
)

type nodeMetrics struct {
	Latency    *metric.Histogram
	Success    *metric.Counter
	Err        *metric.Counter
	DiskStalls *metric.Counter
}

func makeNodeMetrics(reg *metric.Registry, histogramWindow time.Duration) nodeMetrics {
	nm := nodeMetrics{
		Latency:    metric.NewLatency(metaExecLatency, histogramWindow),
		Success:    metric.NewCounter(metaExecSuccess),
		Err:        metric.NewCounter(metaExecError),
		DiskStalls: metric.NewCounter(metaDiskStalls),
	}
	reg.AddMetricStruct(nm)
	return nm
}

// callComplete records very high-level metrics about the number of completed
// calls and their latency. Currently, this only records statistics at the batch
// level; stats on specific lower-level kv operations are not recorded.
func (nm nodeMetrics) callComplete(d time.Duration, pErr *roachpb.Error) {
	if pErr != nil && pErr.TransactionRestart == roachpb.TransactionRestart_NONE {
		nm.Err.Inc(1)
	} else {
		nm.Success.Inc(1)
	}
	nm.Latency.RecordValue(d.Nanoseconds())
}

// A Node manages a map of stores (by store ID) for which it serves
// traffic. A node is the top-level data structure. There is one node
// instance per process. A node accepts incoming RPCs and services
// them by directing the commands contained within RPCs to local
// stores, which in turn direct the commands to specific ranges. Each
// node has access to the global, monolithic Key-Value abstraction via
// its client.DB reference. Nodes use this to allocate node and store
// IDs for bootstrapping the node itself or new stores as they're added
// on subsequent instantiations.
type Node struct {
	stopper     *stop.Stopper
	clusterID   *base.ClusterIDContainer // UUID for Cockroach cluster
	Descriptor  roachpb.NodeDescriptor   // Node ID, network/physical topology
	storeCfg    kvserver.StoreConfig     // Config to use and pass to stores
	eventLogger sql.EventLogger
	stores      *kvserver.Stores // Access to node-local stores
	metrics     nodeMetrics
	recorder    *status.MetricsRecorder
	startedAt   int64
	lastUp      int64
	initialBoot bool // True if this is the first time this node has started.
	txnMetrics  kvcoord.TxnMetrics

	perReplicaServer kvserver.Server
}

// allocateNodeID increments the node id generator key to allocate
// a new, unique node id.
func allocateNodeID(ctx context.Context, db *kv.DB) (roachpb.NodeID, error) {
	val, err := kv.IncrementValRetryable(ctx, db, keys.NodeIDGenerator, 1)
	if err != nil {
		return 0, errors.Wrap(err, "unable to allocate node ID")
	}
	return roachpb.NodeID(val), nil
}

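// NB (descriptive note, not from the original source): node and store IDs are
// both drawn from monotonically increasing counters kept at system keys
// (keys.NodeIDGenerator above, keys.StoreIDGenerator below) and bumped via
// kv.IncrementValRetryable, so IDs are unique across the cluster, though
// retried increments may leave gaps in the sequence.
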
// allocateStoreIDs increments the store id generator key for the
// specified node to allocate count new, unique store ids. The
// first ID in a contiguous range is returned on success.
func allocateStoreIDs(
	ctx context.Context, nodeID roachpb.NodeID, count int64, db *kv.DB,
) (roachpb.StoreID, error) {
	val, err := kv.IncrementValRetryable(ctx, db, keys.StoreIDGenerator, count)
	if err != nil {
		return 0, errors.Wrapf(err, "unable to allocate %d store IDs for node %d", count, nodeID)
	}
	return roachpb.StoreID(val - count + 1), nil
}

// GetBootstrapSchema returns the schema which will be used to bootstrap a new
// server.
func GetBootstrapSchema(
	defaultZoneConfig *zonepb.ZoneConfig, defaultSystemZoneConfig *zonepb.ZoneConfig,
) sqlbase.MetadataSchema {
	return sqlbase.MakeMetadataSchema(keys.SystemSQLCodec, defaultZoneConfig, defaultSystemZoneConfig)
}

// bootstrapCluster initializes the passed-in engines for a new cluster.
// Returns the cluster ID.
//
// The first engine will contain ranges for various static split points (i.e.
// various system ranges and system tables). Note, however, that many of these
// ranges cannot be accessed by KV through regular means until the node liveness
// record is written, since epoch-based leases cannot be granted until then. All
// other engines are initialized with their StoreIdent.
func bootstrapCluster(
	ctx context.Context,
	engines []storage.Engine,
	defaultZoneConfig *zonepb.ZoneConfig,
	defaultSystemZoneConfig *zonepb.ZoneConfig,
) (*initState, error) {
	clusterID := uuid.MakeV4()
	// TODO(andrei): It'd be cool if this method wouldn't do anything to engines
	// other than the first one, and let regular node startup code deal with them.
	var bootstrapVersion clusterversion.ClusterVersion
	for i, eng := range engines {
		cv, err := kvserver.ReadClusterVersion(ctx, eng)
		if err != nil {
			return nil, errors.Wrapf(err, "reading cluster version of %s", eng)
		} else if cv.Major == 0 {
			return nil, errors.Errorf("missing bootstrap version")
		}

		// bootstrapCluster requires matching cluster versions on all engines.
		if i == 0 {
			bootstrapVersion = cv
		} else if bootstrapVersion != cv {
			// NB: err is necessarily nil on this path, so construct a new error
			// rather than wrapping it (wrapping a nil error would return nil).
			return nil, errors.Errorf("found cluster versions %s and %s", bootstrapVersion, cv)
		}

		sIdent := roachpb.StoreIdent{
			ClusterID: clusterID,
			NodeID:    FirstNodeID,
			StoreID:   roachpb.StoreID(i + 1),
		}

		// Initialize the engine backing the store with the store ident and cluster
		// version.
		if err := kvserver.InitEngine(ctx, eng, sIdent); err != nil {
			return nil, err
		}

		// Create first range, writing directly to engine. Note this does
		// not create the range, just its data. Only do this if this is the
		// first store.
		if i == 0 {
			schema := GetBootstrapSchema(defaultZoneConfig, defaultSystemZoneConfig)
			initialValues, tableSplits := schema.GetInitialValues()
			splits := append(config.StaticSplits(), tableSplits...)
			sort.Slice(splits, func(i, j int) bool {
				return splits[i].Less(splits[j])
			})

			if err := kvserver.WriteInitialClusterData(
				ctx, eng, initialValues,
				bootstrapVersion.Version, len(engines), splits,
				hlc.UnixNano(),
			); err != nil {
				return nil, err
			}
		}
	}

	state := &initState{
		initDiskState: initDiskState{
			nodeID:             FirstNodeID,
			clusterID:          clusterID,
			clusterVersion:     bootstrapVersion,
			initializedEngines: engines,
			newEngines:         nil,
		},
		joined: true,
	}
	return state, nil
}

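// For illustration: bootstrapping with three engines produces StoreIdents
// {ClusterID: <new UUID>, NodeID: 1, StoreID: 1..3}. Only the first engine
// receives the initial cluster data and split points; the remaining engines
// are initialized with just their ident and the cluster version.
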
// NewNode returns a new instance of Node.
//
// execCfg can be nil to help bootstrapping of a Server (the Node is created
// before the ExecutorConfig is initialized). In that case, InitLogger() needs
// to be called before the Node is used.
func NewNode(
	cfg kvserver.StoreConfig,
	recorder *status.MetricsRecorder,
	reg *metric.Registry,
	stopper *stop.Stopper,
	txnMetrics kvcoord.TxnMetrics,
	execCfg *sql.ExecutorConfig,
	clusterID *base.ClusterIDContainer,
) *Node {
	var eventLogger sql.EventLogger
	if execCfg != nil {
		eventLogger = sql.MakeEventLogger(execCfg)
	}
	n := &Node{
		storeCfg:    cfg,
		stopper:     stopper,
		recorder:    recorder,
		metrics:     makeNodeMetrics(reg, cfg.HistogramWindowInterval),
		stores:      kvserver.NewStores(cfg.AmbientCtx, cfg.Clock),
		txnMetrics:  txnMetrics,
		eventLogger: eventLogger,
		clusterID:   clusterID,
	}
	n.perReplicaServer = kvserver.MakeServer(&n.Descriptor, n.stores)
	return n
}

// InitLogger needs to be called if a nil execCfg was passed to NewNode().
func (n *Node) InitLogger(execCfg *sql.ExecutorConfig) {
	n.eventLogger = sql.MakeEventLogger(execCfg)
}

// String implements fmt.Stringer.
func (n *Node) String() string {
	return fmt.Sprintf("node=%d", n.Descriptor.NodeID)
}

// AnnotateCtx is a convenience wrapper; see AmbientContext.
func (n *Node) AnnotateCtx(ctx context.Context) context.Context {
	return n.storeCfg.AmbientCtx.AnnotateCtx(ctx)
}

// AnnotateCtxWithSpan is a convenience wrapper; see AmbientContext.
func (n *Node) AnnotateCtxWithSpan(
	ctx context.Context, opName string,
) (context.Context, opentracing.Span) {
	return n.storeCfg.AmbientCtx.AnnotateCtxWithSpan(ctx, opName)
}

// start starts the node by registering the storage instance for the
// RPC service "Node" and initializing stores for each specified
// engine. Launches periodic store gossiping in a goroutine.
// A callback can be optionally provided that will be invoked once this node's
// NodeDescriptor is available, to help bootstrapping.
func (n *Node) start(
	ctx context.Context,
	addr, sqlAddr net.Addr,
	state initState,
	clusterName string,
	attrs roachpb.Attributes,
	locality roachpb.Locality,
	localityAddress []roachpb.LocalityAddress,
	nodeDescriptorCallback func(descriptor roachpb.NodeDescriptor),
) error {
	// Obtaining the NodeID requires a dance of sorts. If the node has initialized
	// stores, the NodeID is persisted in each of them. If not, then we'll need to
	// use the KV store to get a NodeID assigned.
	n.initialBoot = state.joined
	nodeID := state.nodeID
	if nodeID == 0 {
		if !state.joined {
			log.Fatalf(ctx, "node has no NodeID, but claims to not be joining cluster")
		}
		// Allocate NodeID. Note that Gossip is already connected because if there's
		// no NodeID yet, this means that we had to connect Gossip to learn the ClusterID.
		select {
		case <-n.storeCfg.Gossip.Connected:
		default:
			log.Fatalf(ctx, "Gossip is not connected yet")
		}
		ctxWithSpan, span := n.AnnotateCtxWithSpan(ctx, "alloc-node-id")
		newID, err := allocateNodeID(ctxWithSpan, n.storeCfg.DB)
		if err != nil {
			return err
		}
		log.Infof(ctxWithSpan, "new node allocated ID %d", newID)
		span.Finish()
		nodeID = newID
	}

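	// At this point nodeID is definitive: it was either read back from the
	// already-initialized stores or freshly allocated above. The RPC context,
	// the node descriptor, and gossip below all key off this ID.
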
	// Inform the RPC context of the node ID.
	n.storeCfg.RPCContext.NodeID.Set(ctx, nodeID)

	n.startedAt = n.storeCfg.Clock.Now().WallTime
	n.Descriptor = roachpb.NodeDescriptor{
		NodeID:          nodeID,
		Address:         util.MakeUnresolvedAddr(addr.Network(), addr.String()),
		SQLAddress:      util.MakeUnresolvedAddr(sqlAddr.Network(), sqlAddr.String()),
		Attrs:           attrs,
		Locality:        locality,
		LocalityAddress: localityAddress,
		ClusterName:     clusterName,
		ServerVersion:   n.storeCfg.Settings.Version.BinaryVersion(),
		BuildTag:        build.GetInfo().Tag,
		StartedAt:       n.startedAt,
	}
	// Invoke any passed in nodeDescriptorCallback as soon as it's available, to
	// ensure that other components (currently the DistSQLPlanner) are initialized
	// before store startup continues.
	if nodeDescriptorCallback != nil {
		nodeDescriptorCallback(n.Descriptor)
	}

	// Gossip the node descriptor to make this node addressable by node ID.
	n.storeCfg.Gossip.NodeID.Set(ctx, n.Descriptor.NodeID)
	if err := n.storeCfg.Gossip.SetNodeDescriptor(&n.Descriptor); err != nil {
		return errors.Errorf("couldn't gossip descriptor for node %d: %s", n.Descriptor.NodeID, err)
	}

	// Start the closed timestamp subsystem.
	n.storeCfg.ClosedTimestamp.Start(n.Descriptor.NodeID)

	// Create stores from the engines that were already bootstrapped.
	for _, e := range state.initializedEngines {
		s := kvserver.NewStore(ctx, n.storeCfg, e, &n.Descriptor)
		if err := s.Start(ctx, n.stopper); err != nil {
			return errors.Errorf("failed to start store: %s", err)
		}
		capacity, err := s.Capacity(false /* useCached */)
		if err != nil {
			return errors.Errorf("could not query store capacity: %s", err)
		}
		log.Infof(ctx, "initialized store %s: %+v", s, capacity)

		n.addStore(s)
	}

	// Verify all initialized stores agree on cluster and node IDs.
	if err := n.validateStores(ctx); err != nil {
		return err
	}
	log.VEventf(ctx, 2, "validated stores")

	// Compute the time this node was last up; this is done by reading the
	// "last up time" from every store and choosing the most recent timestamp.
	var mostRecentTimestamp hlc.Timestamp
	if err := n.stores.VisitStores(func(s *kvserver.Store) error {
		timestamp, err := s.ReadLastUpTimestamp(ctx)
		if err != nil {
			return err
		}
		if mostRecentTimestamp.Less(timestamp) {
			mostRecentTimestamp = timestamp
		}
		return nil
	}); err != nil {
		return errors.Wrapf(err, "failed to read last up timestamp from stores")
	}
	n.lastUp = mostRecentTimestamp.WallTime

	// Set the stores map as the gossip persistent storage, so that
	// gossip can bootstrap using the most recently persisted set of
	// node addresses.
	if err := n.storeCfg.Gossip.SetStorage(n.stores); err != nil {
		return fmt.Errorf("failed to initialize the gossip interface: %s", err)
	}

	// Bootstrap any uninitialized stores.
	//
	// TODO(tbg): address https://github.com/cockroachdb/cockroach/issues/39415.
	// Should be easy enough. Writing the test is probably most of the work.
	if len(state.newEngines) > 0 {
		if err := n.bootstrapStores(ctx, state.newEngines, n.stopper); err != nil {
			return err
		}
	}

	n.startComputePeriodicMetrics(n.stopper, base.DefaultMetricsSampleInterval)

	// Be careful about moving this line above `startStores`; store migrations rely
	// on the fact that the cluster version has not been updated via Gossip (we
	// have migrations that want to run only if the server starts with a given
	// cluster version, but not if the server starts with a lower one and gets
	// bumped immediately, which would be possible if gossip got started earlier).
	n.startGossip(ctx, n.stopper)

	allEngines := append([]storage.Engine(nil), state.initializedEngines...)
	allEngines = append(allEngines, state.newEngines...)
	log.Infof(ctx, "%s: started with %v engine(s) and attributes %v", n, allEngines, attrs.Attrs)
	return nil
}

// IsDraining returns true if at least one Store housed on this Node is not
// currently allowing range leases to be procured or extended.
func (n *Node) IsDraining() bool {
	var isDraining bool
	if err := n.stores.VisitStores(func(s *kvserver.Store) error {
		isDraining = isDraining || s.IsDraining()
		return nil
	}); err != nil {
		panic(err)
	}
	return isDraining
}

// SetDraining sets the draining mode on all of the node's underlying stores.
// The reporter callback, if non-nil, is called on a best effort basis
// to report work that needed to be done and which may or may not have
// been done by the time this call returns. See the explanation in
// pkg/server/drain.go for details.
func (n *Node) SetDraining(drain bool, reporter func(int, string)) error {
	return n.stores.VisitStores(func(s *kvserver.Store) error {
		s.SetDraining(drain, reporter)
		return nil
	})
}

// SetHLCUpperBound sets the upper bound of the HLC wall time on all of the
// node's underlying stores.
func (n *Node) SetHLCUpperBound(ctx context.Context, hlcUpperBound int64) error {
	return n.stores.VisitStores(func(s *kvserver.Store) error {
		return s.WriteHLCUpperBound(ctx, hlcUpperBound)
	})
}

func (n *Node) addStore(store *kvserver.Store) {
	cv, err := store.GetClusterVersion(context.TODO())
	if err != nil {
		log.Fatalf(context.TODO(), "%v", err)
	}
	if cv == (clusterversion.ClusterVersion{}) {
		// The store should have had a version written to it during the store
		// bootstrap process.
		log.Fatal(context.TODO(), "attempting to add a store without a version")
	}
	n.stores.AddStore(store)
	n.recorder.AddStore(store)
}

// validateStores iterates over all stores, verifying they agree on node ID.
// The node's ident is initialized based on the agreed-upon node ID. Note that
// cluster ID consistency is checked elsewhere in inspectEngines.
//
// TODO(tbg): remove this, we already validate everything in inspectEngines now.
func (n *Node) validateStores(ctx context.Context) error {
	return n.stores.VisitStores(func(s *kvserver.Store) error {
		if n.Descriptor.NodeID != s.Ident.NodeID {
			return errors.Errorf("store %s node ID doesn't match node ID: %d", s, n.Descriptor.NodeID)
		}
		return nil
	})
}

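// For illustration: on a restart with one initialized engine and one newly
// added (empty) engine, start() above builds a Store for the initialized
// engine directly, while the empty engine flows through bootstrapStores
// below, which assigns it a freshly allocated StoreID before adding it via
// addStore.
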
// bootstrapStores bootstraps uninitialized stores once the cluster
// and node IDs have been established for this node. Store IDs are
// allocated via a sequence id generator stored at a system key per
// node. The new stores are added to n.stores.
func (n *Node) bootstrapStores(
	ctx context.Context, emptyEngines []storage.Engine, stopper *stop.Stopper,
) error {
	if n.clusterID.Get() == uuid.Nil {
		return errors.New("ClusterID missing during store bootstrap of auxiliary store")
	}

	{
		// Bootstrap all waiting stores by allocating a new store id for
		// each and invoking kvserver.InitEngine() to persist it and the cluster
		// version and to create stores.
		inc := int64(len(emptyEngines))
		firstID, err := allocateStoreIDs(ctx, n.Descriptor.NodeID, inc, n.storeCfg.DB)
		if err != nil {
			return errors.Errorf("error allocating store ids: %s", err)
		}
		sIdent := roachpb.StoreIdent{
			ClusterID: n.clusterID.Get(),
			NodeID:    n.Descriptor.NodeID,
			StoreID:   firstID,
		}
		for _, eng := range emptyEngines {
			if err := kvserver.InitEngine(ctx, eng, sIdent); err != nil {
				return err
			}

			s := kvserver.NewStore(ctx, n.storeCfg, eng, &n.Descriptor)
			if err := s.Start(ctx, stopper); err != nil {
				return err
			}
			n.addStore(s)
			log.Infof(ctx, "bootstrapped store %s", s)
			// Done regularly in Node.startGossip, but this cuts down the time
			// until this store is used for range allocations.
			if err := s.GossipStore(ctx, false /* useCached */); err != nil {
				log.Warningf(ctx, "error doing initial gossiping: %s", err)
			}

			sIdent.StoreID++
		}
	}

	// Write a new status summary after all stores have been bootstrapped; this
	// helps the UI remain responsive when new nodes are added.
	if err := n.writeNodeStatus(ctx, 0 /* alertTTL */); err != nil {
		log.Warningf(ctx, "error writing node summary after store bootstrap: %s", err)
	}

	return nil
}

// startGossip loops on a periodic ticker to gossip node-related
// information. Starts a goroutine to loop until the node is closed.
func (n *Node) startGossip(ctx context.Context, stopper *stop.Stopper) {
	ctx = n.AnnotateCtx(ctx)
	stopper.RunWorker(ctx, func(ctx context.Context) {
		// Verify we've already gossiped our node descriptor.
		//
		// TODO(tbg): see if we really needed to do this earlier already. We
		// probably needed to (this call has to come late for ... reasons I
		// still need to look into) and nobody can talk to this node until
		// the descriptor is in Gossip.
		if _, err := n.storeCfg.Gossip.GetNodeDescriptor(n.Descriptor.NodeID); err != nil {
			panic(err)
		}

		// NB: Gossip may not be connected at this point. That's fine though,
		// we can still gossip something; Gossip sends it out reactively once
		// it can.

		statusTicker := time.NewTicker(gossipStatusInterval)
		storesTicker := time.NewTicker(gossip.StoresInterval)
		nodeTicker := time.NewTicker(gossip.NodeDescriptorInterval)
		defer statusTicker.Stop()
		defer storesTicker.Stop()
		defer nodeTicker.Stop()
		n.gossipStores(ctx) // one-off run before going to sleep
		for {
			select {
			case <-statusTicker.C:
				n.storeCfg.Gossip.LogStatus()
			case <-storesTicker.C:
				n.gossipStores(ctx)
			case <-nodeTicker.C:
				if err := n.storeCfg.Gossip.SetNodeDescriptor(&n.Descriptor); err != nil {
					log.Warningf(ctx, "couldn't gossip descriptor for node %d: %s", n.Descriptor.NodeID, err)
				}
			case <-stopper.ShouldStop():
				return
			}
		}
	})
}

// gossipStores broadcasts each store and dead replica to the gossip network.
func (n *Node) gossipStores(ctx context.Context) {
	if err := n.stores.VisitStores(func(s *kvserver.Store) error {
		return s.GossipStore(ctx, false /* useCached */)
	}); err != nil {
		log.Warningf(ctx, "%v", err)
	}
}

// startComputePeriodicMetrics starts a loop which periodically instructs each
// store to compute the value of metrics which cannot be incrementally
// maintained.
func (n *Node) startComputePeriodicMetrics(stopper *stop.Stopper, interval time.Duration) {
	ctx := n.AnnotateCtx(context.Background())
	stopper.RunWorker(ctx, func(ctx context.Context) {
		// Compute periodic stats at the same frequency as metrics are sampled.
		ticker := time.NewTicker(interval)
		defer ticker.Stop()
		for tick := 0; ; tick++ {
			select {
			case <-ticker.C:
				if err := n.computePeriodicMetrics(ctx, tick); err != nil {
					log.Errorf(ctx, "failed computing periodic metrics: %s", err)
				}
			case <-stopper.ShouldStop():
				return
			}
		}
	})
}

// computePeriodicMetrics instructs each store to compute the value of
// complicated metrics.
func (n *Node) computePeriodicMetrics(ctx context.Context, tick int) error {
	return n.stores.VisitStores(func(store *kvserver.Store) error {
		if err := store.ComputeMetrics(ctx, tick); err != nil {
			log.Warningf(ctx, "%s: unable to compute metrics: %s", store, err)
		}
		return nil
	})
}

func (n *Node) startGraphiteStatsExporter(st *cluster.Settings) {
	ctx := logtags.AddTag(n.AnnotateCtx(context.Background()), "graphite stats exporter", nil)
	pm := metric.MakePrometheusExporter()

	n.stopper.RunWorker(ctx, func(ctx context.Context) {
		var timer timeutil.Timer
		defer timer.Stop()
		for {
			timer.Reset(graphiteInterval.Get(&st.SV))
			select {
			case <-n.stopper.ShouldStop():
				return
			case <-timer.C:
				timer.Read = true
				endpoint := graphiteEndpoint.Get(&st.SV)
				if endpoint != "" {
					if err := n.recorder.ExportToGraphite(ctx, endpoint, &pm); err != nil {
						log.Infof(ctx, "error pushing metrics to graphite: %s\n", err)
					}
				}
			}
		}
	})
}

// startWriteNodeStatus begins periodically persisting status summaries for the
// node and its stores.
func (n *Node) startWriteNodeStatus(frequency time.Duration) {
	ctx := logtags.AddTag(n.AnnotateCtx(context.Background()), "summaries", nil)
	// Immediately record summaries once on server startup.
	if err := n.writeNodeStatus(ctx, 0 /* alertTTL */); err != nil {
		log.Warningf(ctx, "error recording initial status summaries: %s", err)
	}
	n.stopper.RunWorker(ctx, func(ctx context.Context) {
		// Write a status summary immediately; this helps the UI remain
		// responsive when new nodes are added.
		ticker := time.NewTicker(frequency)
		defer ticker.Stop()
		for {
			select {
			case <-ticker.C:
				// Use an alertTTL of twice the ticker frequency. This makes sure that
				// alerts don't disappear and reappear spuriously while at the same
				// time ensuring that an alert doesn't linger for too long after having
				// resolved.
				if err := n.writeNodeStatus(ctx, 2*frequency); err != nil {
					log.Warningf(ctx, "error recording status summaries: %s", err)
				}
			case <-n.stopper.ShouldStop():
				return
			}
		}
	})
}

// writeNodeStatus retrieves status summaries from the supplied
// NodeStatusRecorder and persists them to the cockroach data store.
func (n *Node) writeNodeStatus(ctx context.Context, alertTTL time.Duration) error {
	var err error
	if runErr := n.stopper.RunTask(ctx, "node.Node: writing summary", func(ctx context.Context) {
		nodeStatus := n.recorder.GenerateNodeStatus(ctx)
		if nodeStatus == nil {
			return
		}

		if result := n.recorder.CheckHealth(ctx, *nodeStatus); len(result.Alerts) != 0 {
			var numNodes int
			if err := n.storeCfg.Gossip.IterateInfos(gossip.KeyNodeIDPrefix, func(k string, info gossip.Info) error {
				numNodes++
				return nil
			}); err != nil {
				log.Warningf(ctx, "%v", err)
			}
			if numNodes > 1 {
				// Avoid this warning on single-node clusters, which require special UX.
				log.Warningf(ctx, "health alerts detected: %+v", result)
			}
			if err := n.storeCfg.Gossip.AddInfoProto(
				gossip.MakeNodeHealthAlertKey(n.Descriptor.NodeID), &result, alertTTL,
			); err != nil {
				log.Warningf(ctx, "unable to gossip health alerts: %+v", result)
			}

			// TODO(tschottdorf): add a metric that we increment every time there are
			// alerts. This can help understand how long the cluster has been in that
			// state (since it'll be incremented every ~10s).
		}

		err = n.recorder.WriteNodeStatus(ctx, n.storeCfg.DB, *nodeStatus)
	}); runErr != nil {
		err = runErr
	}
	return err
}

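// For illustration: each writeNodeStatus call above generates a status
// summary, gossips any health alerts under the node's health-alert key (with
// alertTTL bounding how long they persist in gossip), and then writes the
// summary itself to the KV store via the MetricsRecorder.
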
// recordJoinEvent begins an asynchronous task which attempts to log a "node
// join" or "node restart" event. This query will retry until it succeeds or the
// server stops.
func (n *Node) recordJoinEvent() {
	if !n.storeCfg.LogRangeEvents {
		return
	}

	logEventType := sql.EventLogNodeRestart
	lastUp := n.lastUp
	if n.initialBoot {
		logEventType = sql.EventLogNodeJoin
		lastUp = n.startedAt
	}

	n.stopper.RunWorker(context.Background(), func(bgCtx context.Context) {
		ctx, span := n.AnnotateCtxWithSpan(bgCtx, "record-join-event")
		defer span.Finish()
		retryOpts := base.DefaultRetryOptions()
		retryOpts.Closer = n.stopper.ShouldStop()
		for r := retry.Start(retryOpts); r.Next(); {
			if err := n.storeCfg.DB.Txn(ctx, func(ctx context.Context, txn *kv.Txn) error {
				return n.eventLogger.InsertEventRecord(
					ctx,
					txn,
					logEventType,
					int32(n.Descriptor.NodeID),
					int32(n.Descriptor.NodeID),
					struct {
						Descriptor roachpb.NodeDescriptor
						ClusterID  uuid.UUID
						StartedAt  int64
						LastUp     int64
					}{n.Descriptor, n.clusterID.Get(), n.startedAt, lastUp},
				)
			}); err != nil {
				log.Warningf(ctx, "%s: unable to log %s event: %s", n, logEventType, err)
			} else {
				return
			}
		}
	})
}

// If we receive a (proto-marshaled) roachpb.BatchRequest whose Requests contain
// a message type unknown to this node, we will end up with a zero entry in the
// slice. If we don't error out early, this breaks all sorts of assumptions and
// usually ends in a panic.
func checkNoUnknownRequest(reqs []roachpb.RequestUnion) *roachpb.UnsupportedRequestError {
	for _, req := range reqs {
		if req.GetValue() == nil {
			return &roachpb.UnsupportedRequestError{}
		}
	}
	return nil
}

func (n *Node) batchInternal(
	ctx context.Context, args *roachpb.BatchRequest,
) (*roachpb.BatchResponse, error) {
	if detail := checkNoUnknownRequest(args.Requests); detail != nil {
		var br roachpb.BatchResponse
		br.Error = roachpb.NewError(detail)
		return &br, nil
	}

	var br *roachpb.BatchResponse
	if err := n.stopper.RunTaskWithErr(ctx, "node.Node: batch", func(ctx context.Context) error {
		var finishSpan func(*roachpb.BatchResponse)
		// Shadow ctx from the outer function. Written like this to pass the linter.
		ctx, finishSpan = n.setupSpanForIncomingRPC(ctx, grpcutil.IsLocalRequestContext(ctx))
		// NB: wrapped to delay br evaluation to its value when returning.
		defer func() { finishSpan(br) }()
		if log.HasSpanOrEvent(ctx) {
			log.Eventf(ctx, "node received request: %s", args.Summary())
		}

		tStart := timeutil.Now()
		var pErr *roachpb.Error
		br, pErr = n.stores.Send(ctx, *args)
		if pErr != nil {
			br = &roachpb.BatchResponse{}
			log.VErrEventf(ctx, 3, "%T", pErr.GetDetail())
		}
		if br.Error != nil {
			panic(roachpb.ErrorUnexpectedlySet(n.stores, br))
		}
		n.metrics.callComplete(timeutil.Since(tStart), pErr)
		br.Error = pErr
		return nil
	}); err != nil {
		return nil, err
	}
	return br, nil
}

// Batch implements the roachpb.InternalServer interface.
func (n *Node) Batch(
	ctx context.Context, args *roachpb.BatchRequest,
) (*roachpb.BatchResponse, error) {
	// NB: Node.Batch is called directly for "local" calls. We don't want to
	// carry the associated log tags forward as doing so makes adding additional
	// log tags more expensive and makes local calls differ from remote calls.
	ctx = n.storeCfg.AmbientCtx.ResetAndAnnotateCtx(ctx)

	br, err := n.batchInternal(ctx, args)

	// We always return errors via BatchResponse.Error so structure is
	// preserved; plain errors are presumed to be from the RPC
	// framework and not from cockroach.
	if err != nil {
		if br == nil {
			br = &roachpb.BatchResponse{}
		}
		if br.Error != nil {
			log.Fatalf(
				ctx, "attempting to return both a plain error (%s) and roachpb.Error (%s)", err, br.Error,
			)
		}
		br.Error = roachpb.NewError(err)
	}
	return br, nil
}

// setupSpanForIncomingRPC takes a context and returns a derived context with a
// new span in it. Depending on the input context, that span might be a root
// span or a child span. If it is a child span, it might be a child span of a
// local or a remote span. Note that supporting both the "child of local span"
// and "child of remote span" cases is important, as this RPC can be called
// either through the network or directly if the caller is local.
//
// It returns the derived context and a cleanup function to be called when
// servicing the RPC is done. The cleanup function will close the span and, in
// case the span was the child of a remote span and "snowball tracing" was
// enabled on that parent span, it serializes the local trace into the
// BatchResponse. The cleanup function takes the BatchResponse in which the
// response is to be serialized. The BatchResponse can be nil in case no
// response is to be returned to the RPC caller.
func (n *Node) setupSpanForIncomingRPC(
	ctx context.Context, isLocalRequest bool,
) (context.Context, func(*roachpb.BatchResponse)) {
	// The operation name matches the one created by the interceptor in the
	// remoteTrace case below.
	const opName = "/cockroach.roachpb.Internal/Batch"
	var newSpan, grpcSpan opentracing.Span
	if isLocalRequest {
		// This is a local request which circumvented gRPC. Start a span now.
		ctx, newSpan = tracing.ChildSpan(ctx, opName)
	} else {
		grpcSpan = opentracing.SpanFromContext(ctx)
		if grpcSpan == nil {
			// If tracing information was passed via gRPC metadata, the gRPC interceptor
			// should have opened a span for us. If not, open a span now (if tracing is
			// disabled, this will be a noop span).
			newSpan = n.storeCfg.AmbientCtx.Tracer.(*tracing.Tracer).StartRootSpan(
				opName, n.storeCfg.AmbientCtx.LogTags(), tracing.NonRecordableSpan,
			)
			ctx = opentracing.ContextWithSpan(ctx, newSpan)
		} else {
			grpcSpan.SetTag("node", n.Descriptor.NodeID)
		}
	}

	finishSpan := func(br *roachpb.BatchResponse) {
		if newSpan != nil {
			newSpan.Finish()
		}
		if br == nil {
			return
		}
		if grpcSpan != nil {
			// If this is a "snowball trace", we'll need to return all the recorded
			// spans in the BatchResponse at the end of the request.
			// We don't want to do this if the operation is on the same host, in which
			// case everything is already part of the same recording.
			if rec := tracing.GetRecording(grpcSpan); rec != nil {
				br.CollectedSpans = append(br.CollectedSpans, rec...)
			}
		}
	}
	return ctx, finishSpan
}

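// For illustration: batchInternal above calls setupSpanForIncomingRPC once per
// batch and defers the returned cleanup with the eventual *BatchResponse, so
// any spans recorded for a remote "snowball" trace ride back to the caller in
// BatchResponse.CollectedSpans.
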
// RangeFeed implements the roachpb.InternalServer interface.
func (n *Node) RangeFeed(
	args *roachpb.RangeFeedRequest, stream roachpb.Internal_RangeFeedServer,
) error {
	growstack.Grow()

	pErr := n.stores.RangeFeed(args, stream)
	if pErr != nil {
		var event roachpb.RangeFeedEvent
		event.SetValue(&roachpb.RangeFeedError{
			Error: *pErr,
		})
		return stream.Send(&event)
	}
	return nil
}
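
// NB: errors from the underlying stores are delivered in-band as a
// RangeFeedError event on the stream (see above) rather than as a gRPC error;
// the handler then returns nil unless the Send itself fails.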