github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/server/node.go

     1  // Copyright 2014 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package server
    12  
    13  import (
    14  	"context"
    15  	"fmt"
    16  	"net"
    17  	"sort"
    18  	"time"
    19  
    20  	"github.com/cockroachdb/cockroach/pkg/base"
    21  	"github.com/cockroachdb/cockroach/pkg/build"
    22  	"github.com/cockroachdb/cockroach/pkg/clusterversion"
    23  	"github.com/cockroachdb/cockroach/pkg/config"
    24  	"github.com/cockroachdb/cockroach/pkg/config/zonepb"
    25  	"github.com/cockroachdb/cockroach/pkg/gossip"
    26  	"github.com/cockroachdb/cockroach/pkg/keys"
    27  	"github.com/cockroachdb/cockroach/pkg/kv"
    28  	"github.com/cockroachdb/cockroach/pkg/kv/kvclient/kvcoord"
    29  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver"
    30  	"github.com/cockroachdb/cockroach/pkg/roachpb"
    31  	"github.com/cockroachdb/cockroach/pkg/server/status"
    32  	"github.com/cockroachdb/cockroach/pkg/settings"
    33  	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
    34  	"github.com/cockroachdb/cockroach/pkg/sql"
    35  	"github.com/cockroachdb/cockroach/pkg/sql/sqlbase"
    36  	"github.com/cockroachdb/cockroach/pkg/storage"
    37  	"github.com/cockroachdb/cockroach/pkg/util"
    38  	"github.com/cockroachdb/cockroach/pkg/util/growstack"
    39  	"github.com/cockroachdb/cockroach/pkg/util/grpcutil"
    40  	"github.com/cockroachdb/cockroach/pkg/util/hlc"
    41  	"github.com/cockroachdb/cockroach/pkg/util/log"
    42  	"github.com/cockroachdb/cockroach/pkg/util/metric"
    43  	"github.com/cockroachdb/cockroach/pkg/util/retry"
    44  	"github.com/cockroachdb/cockroach/pkg/util/stop"
    45  	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
    46  	"github.com/cockroachdb/cockroach/pkg/util/tracing"
    47  	"github.com/cockroachdb/cockroach/pkg/util/uuid"
    48  	"github.com/cockroachdb/errors"
    49  	"github.com/cockroachdb/logtags"
    50  	opentracing "github.com/opentracing/opentracing-go"
    51  )
    52  
    53  const (
    54  	// gossipStatusInterval is the interval for logging gossip status.
    55  	gossipStatusInterval = 1 * time.Minute
    56  
    57  	// FirstNodeID is the node ID of the first node in a new cluster.
    58  	FirstNodeID         = 1
    59  	graphiteIntervalKey = "external.graphite.interval"
    60  	maxGraphiteInterval = 15 * time.Minute
    61  )
    62  
    63  // Metric names.
    64  var (
    65  	metaExecLatency = metric.Metadata{
    66  		Name:        "exec.latency",
    67  		Help:        "Latency of batch KV requests executed on this node",
    68  		Measurement: "Latency",
    69  		Unit:        metric.Unit_NANOSECONDS,
    70  	}
    71  	metaExecSuccess = metric.Metadata{
    72  		Name:        "exec.success",
    73  		Help:        "Number of batch KV requests executed successfully on this node",
    74  		Measurement: "Batch KV Requests",
    75  		Unit:        metric.Unit_COUNT,
    76  	}
    77  	metaExecError = metric.Metadata{
    78  		Name:        "exec.error",
    79  		Help:        "Number of batch KV requests that failed to execute on this node",
    80  		Measurement: "Batch KV Requests",
    81  		Unit:        metric.Unit_COUNT,
    82  	}
    83  
    84  	metaDiskStalls = metric.Metadata{
    85  		Name:        "engine.stalls",
    86  		Help:        "Number of disk stalls detected on this node",
    87  		Measurement: "Disk stalls detected",
    88  		Unit:        metric.Unit_COUNT,
    89  	}
    90  )
    91  
    92  // Cluster settings.
    93  var (
    94  	// graphiteEndpoint is the host:port, if any, of the Graphite metrics server.
    95  	graphiteEndpoint = settings.RegisterPublicStringSetting(
    96  		"external.graphite.endpoint",
    97  		"if nonempty, push server metrics to the Graphite or Carbon server at the specified host:port",
    98  		"",
    99  	)
   100  	// graphiteInterval is how often metrics are pushed to Graphite, if enabled.
   101  	graphiteInterval = settings.RegisterPublicNonNegativeDurationSettingWithMaximum(
   102  		graphiteIntervalKey,
   103  		"the interval at which metrics are pushed to Graphite (if enabled)",
   104  		10*time.Second,
   105  		maxGraphiteInterval,
   106  	)
   107  )
   108  
   109  type nodeMetrics struct {
   110  	Latency    *metric.Histogram
   111  	Success    *metric.Counter
   112  	Err        *metric.Counter
   113  	DiskStalls *metric.Counter
   114  }
   115  
   116  func makeNodeMetrics(reg *metric.Registry, histogramWindow time.Duration) nodeMetrics {
   117  	nm := nodeMetrics{
   118  		Latency:    metric.NewLatency(metaExecLatency, histogramWindow),
   119  		Success:    metric.NewCounter(metaExecSuccess),
   120  		Err:        metric.NewCounter(metaExecError),
   121  		DiskStalls: metric.NewCounter(metaDiskStalls),
   122  	}
   123  	reg.AddMetricStruct(nm)
   124  	return nm
   125  }
   126  
   127  // callComplete records very high-level metrics about the number of completed
   128  // calls and their latency. Currently, this only records statistics at the batch
   129  // level; stats on specific lower-level kv operations are not recorded.
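        //
        // As a rough sketch of the accounting (timings and the error are hypothetical):
        //
        //	nm.callComplete(5*time.Millisecond, nil)
        //	// -> Success.Inc(1); Latency records 5ms.
        //
        //	pErr := roachpb.NewErrorf("boom") // TransactionRestart_NONE
        //	nm.callComplete(5*time.Millisecond, pErr)
        //	// -> Err.Inc(1). Retryable errors (TransactionRestart != NONE) are
        //	//    counted as successes instead.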
   130  func (nm nodeMetrics) callComplete(d time.Duration, pErr *roachpb.Error) {
   131  	if pErr != nil && pErr.TransactionRestart == roachpb.TransactionRestart_NONE {
   132  		nm.Err.Inc(1)
   133  	} else {
   134  		nm.Success.Inc(1)
   135  	}
   136  	nm.Latency.RecordValue(d.Nanoseconds())
   137  }
   138  
   139  // A Node manages a map of stores (by store ID) for which it serves
   140  // traffic. A node is the top-level data structure. There is one node
   141  // instance per process. A node accepts incoming RPCs and services
   142  // them by directing the commands contained within RPCs to local
   143  // stores, which in turn direct the commands to specific ranges. Each
   144  // node has access to the global, monolithic Key-Value abstraction via
   145  // its kv.DB reference. Nodes use this to allocate node and store
   146  // IDs for bootstrapping the node itself or new stores as they're added
   147  // on subsequent instantiations.
   148  type Node struct {
   149  	stopper     *stop.Stopper
   150  	clusterID   *base.ClusterIDContainer // UUID for Cockroach cluster
   151  	Descriptor  roachpb.NodeDescriptor   // Node ID, network/physical topology
   152  	storeCfg    kvserver.StoreConfig     // Config to use and pass to stores
   153  	eventLogger sql.EventLogger
   154  	stores      *kvserver.Stores // Access to node-local stores
   155  	metrics     nodeMetrics
   156  	recorder    *status.MetricsRecorder
   157  	startedAt   int64
   158  	lastUp      int64
   159  	initialBoot bool // True if this is the first time this node has started.
   160  	txnMetrics  kvcoord.TxnMetrics
   161  
   162  	perReplicaServer kvserver.Server
   163  }
   164  
   165  // allocateNodeID increments the node id generator key to allocate
   166  // a new, unique node id.
   167  func allocateNodeID(ctx context.Context, db *kv.DB) (roachpb.NodeID, error) {
   168  	val, err := kv.IncrementValRetryable(ctx, db, keys.NodeIDGenerator, 1)
   169  	if err != nil {
   170  		return 0, errors.Wrap(err, "unable to allocate node ID")
   171  	}
   172  	return roachpb.NodeID(val), nil
   173  }
   174  
   175  // allocateStoreIDs increments the store id generator key for the
   176  // specified node to allocate count new, unique store ids. The
   177  // first ID in a contiguous range is returned on success.
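        //
        // For example, a caller bootstrapping three new engines might do roughly the
        // following (a sketch; error handling elided):
        //
        //	firstID, _ := allocateStoreIDs(ctx, nodeID, 3, db)
        //	// The three stores then receive firstID, firstID+1, and firstID+2,
        //	// e.g. by incrementing a shared StoreIdent as in Node.bootstrapStores.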
   178  func allocateStoreIDs(
   179  	ctx context.Context, nodeID roachpb.NodeID, count int64, db *kv.DB,
   180  ) (roachpb.StoreID, error) {
   181  	val, err := kv.IncrementValRetryable(ctx, db, keys.StoreIDGenerator, count)
   182  	if err != nil {
   183  		return 0, errors.Wrapf(err, "unable to allocate %d store IDs for node %d", count, nodeID)
   184  	}
   185  	return roachpb.StoreID(val - count + 1), nil
   186  }
   187  
   188  // GetBootstrapSchema returns the schema which will be used to bootstrap a new
   189  // server.
   190  func GetBootstrapSchema(
   191  	defaultZoneConfig *zonepb.ZoneConfig, defaultSystemZoneConfig *zonepb.ZoneConfig,
   192  ) sqlbase.MetadataSchema {
   193  	return sqlbase.MakeMetadataSchema(keys.SystemSQLCodec, defaultZoneConfig, defaultSystemZoneConfig)
   194  }
   195  
   196  // bootstrapCluster initializes the passed-in engines for a new cluster.
   197  // Returns the cluster ID.
   198  //
   199  // The first engine will contain ranges for various static split points (i.e.
   200  // various system ranges and system tables). Note, however, that many of these
   201  // ranges cannot be accessed through regular KV means until the node liveness
   202  // record is written, since epoch-based leases cannot be granted until then.
   203  // All other engines are initialized with their StoreIdent.
   204  func bootstrapCluster(
   205  	ctx context.Context,
   206  	engines []storage.Engine,
   207  	defaultZoneConfig *zonepb.ZoneConfig,
   208  	defaultSystemZoneConfig *zonepb.ZoneConfig,
   209  ) (*initState, error) {
   210  	clusterID := uuid.MakeV4()
   211  	// TODO(andrei): It'd be cool if this method wouldn't do anything to engines
   212  	// other than the first one, and let regular node startup code deal with them.
   213  	var bootstrapVersion clusterversion.ClusterVersion
   214  	for i, eng := range engines {
   215  		cv, err := kvserver.ReadClusterVersion(ctx, eng)
   216  		if err != nil {
   217  			return nil, errors.Wrapf(err, "reading cluster version of %s", eng)
   218  		} else if cv.Major == 0 {
   219  			return nil, errors.Errorf("missing bootstrap version")
   220  		}
   221  
   222  		// bootstrapCluster requires matching cluster versions on all engines.
   223  		if i == 0 {
   224  			bootstrapVersion = cv
   225  		} else if bootstrapVersion != cv {
   226  			return nil, errors.Errorf("found cluster versions %s and %s", bootstrapVersion, cv)
   227  		}
   228  
   229  		sIdent := roachpb.StoreIdent{
   230  			ClusterID: clusterID,
   231  			NodeID:    FirstNodeID,
   232  			StoreID:   roachpb.StoreID(i + 1),
   233  		}
   234  
   235  		// Initialize the engine backing the store with the store ident and cluster
   236  		// version.
   237  		if err := kvserver.InitEngine(ctx, eng, sIdent); err != nil {
   238  			return nil, err
   239  		}
   240  
   241  		// Write the data for the first ranges directly to the engine. Note this
   242  		// does not create the ranges themselves, just their data. Only do this
   243  		// for the first store.
   244  		if i == 0 {
   245  			schema := GetBootstrapSchema(defaultZoneConfig, defaultSystemZoneConfig)
   246  			initialValues, tableSplits := schema.GetInitialValues()
   247  			splits := append(config.StaticSplits(), tableSplits...)
   248  			sort.Slice(splits, func(i, j int) bool {
   249  				return splits[i].Less(splits[j])
   250  			})
   251  
   252  			if err := kvserver.WriteInitialClusterData(
   253  				ctx, eng, initialValues,
   254  				bootstrapVersion.Version, len(engines), splits,
   255  				hlc.UnixNano(),
   256  			); err != nil {
   257  				return nil, err
   258  			}
   259  		}
   260  	}
   261  
   262  	state := &initState{
   263  		initDiskState: initDiskState{
   264  			nodeID:             FirstNodeID,
   265  			clusterID:          clusterID,
   266  			clusterVersion:     bootstrapVersion,
   267  			initializedEngines: engines,
   268  			newEngines:         nil,
   269  		},
   270  		joined: true,
   271  	}
   272  	return state, nil
   273  }
   274  
   275  // NewNode returns a new instance of Node.
   276  //
   277  // execCfg can be nil to help bootstrapping of a Server (the Node is created
   278  // before the ExecutorConfig is initialized). In that case, InitLogger() needs
   279  // to be called before the Node is used.
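        //
        // A minimal sketch of the two-phase construction (variable names are
        // illustrative):
        //
        //	n := NewNode(storeCfg, recorder, registry, stopper, txnMetrics, nil /* execCfg */, clusterID)
        //	// ... build the ExecutorConfig, which needs the Node to exist ...
        //	n.InitLogger(execCfg)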
   280  func NewNode(
   281  	cfg kvserver.StoreConfig,
   282  	recorder *status.MetricsRecorder,
   283  	reg *metric.Registry,
   284  	stopper *stop.Stopper,
   285  	txnMetrics kvcoord.TxnMetrics,
   286  	execCfg *sql.ExecutorConfig,
   287  	clusterID *base.ClusterIDContainer,
   288  ) *Node {
   289  	var eventLogger sql.EventLogger
   290  	if execCfg != nil {
   291  		eventLogger = sql.MakeEventLogger(execCfg)
   292  	}
   293  	n := &Node{
   294  		storeCfg:    cfg,
   295  		stopper:     stopper,
   296  		recorder:    recorder,
   297  		metrics:     makeNodeMetrics(reg, cfg.HistogramWindowInterval),
   298  		stores:      kvserver.NewStores(cfg.AmbientCtx, cfg.Clock),
   299  		txnMetrics:  txnMetrics,
   300  		eventLogger: eventLogger,
   301  		clusterID:   clusterID,
   302  	}
   303  	n.perReplicaServer = kvserver.MakeServer(&n.Descriptor, n.stores)
   304  	return n
   305  }
   306  
   307  // InitLogger needs to be called if a nil execCfg was passed to NewNode().
   308  func (n *Node) InitLogger(execCfg *sql.ExecutorConfig) {
   309  	n.eventLogger = sql.MakeEventLogger(execCfg)
   310  }
   311  
   312  // String implements fmt.Stringer.
   313  func (n *Node) String() string {
   314  	return fmt.Sprintf("node=%d", n.Descriptor.NodeID)
   315  }
   316  
   317  // AnnotateCtx is a convenience wrapper; see AmbientContext.
   318  func (n *Node) AnnotateCtx(ctx context.Context) context.Context {
   319  	return n.storeCfg.AmbientCtx.AnnotateCtx(ctx)
   320  }
   321  
   322  // AnnotateCtxWithSpan is a convenience wrapper; see AmbientContext.
   323  func (n *Node) AnnotateCtxWithSpan(
   324  	ctx context.Context, opName string,
   325  ) (context.Context, opentracing.Span) {
   326  	return n.storeCfg.AmbientCtx.AnnotateCtxWithSpan(ctx, opName)
   327  }
   328  
   329  // start starts the node by registering the storage instance for the
   330  // RPC service "Node" and initializing stores for each specified
   331  // engine. Launches periodic store gossiping in a goroutine.
   332  // A callback can be optionally provided that will be invoked once this node's
   333  // NodeDescriptor is available, to help bootstrapping.
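        //
        // For illustration, a caller might pass a callback along these lines (the
        // body shown is a placeholder; in practice the descriptor is handed to other
        // components such as the DistSQLPlanner):
        //
        //	cb := func(desc roachpb.NodeDescriptor) {
        //		log.Infof(ctx, "node descriptor available: %+v", desc)
        //	}
        //	err := n.start(ctx, addr, sqlAddr, state, clusterName, attrs, locality, nil, cb)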
   334  func (n *Node) start(
   335  	ctx context.Context,
   336  	addr, sqlAddr net.Addr,
   337  	state initState,
   338  	clusterName string,
   339  	attrs roachpb.Attributes,
   340  	locality roachpb.Locality,
   341  	localityAddress []roachpb.LocalityAddress,
   342  	nodeDescriptorCallback func(descriptor roachpb.NodeDescriptor),
   343  ) error {
   344  	// Obtaining the NodeID requires a dance of sorts. If the node has initialized
   345  	// stores, the NodeID is persisted in each of them. If not, then we'll need to
   346  	// use the KV store to get a NodeID assigned.
   347  	n.initialBoot = state.joined
   348  	nodeID := state.nodeID
   349  	if nodeID == 0 {
   350  		if !state.joined {
   351  			log.Fatalf(ctx, "node has no NodeID, but claims to not be joining cluster")
   352  		}
   353  		// Allocate NodeID. Note that Gossip is already connected because if there's
   354  		// no NodeID yet, this means that we had to connect Gossip to learn the ClusterID.
   355  		select {
   356  		case <-n.storeCfg.Gossip.Connected:
   357  		default:
   358  			log.Fatalf(ctx, "Gossip is not connected yet")
   359  		}
   360  		ctxWithSpan, span := n.AnnotateCtxWithSpan(ctx, "alloc-node-id")
   361  		newID, err := allocateNodeID(ctxWithSpan, n.storeCfg.DB)
   362  		if err != nil {
   363  			return err
   364  		}
   365  		log.Infof(ctxWithSpan, "new node allocated ID %d", newID)
   366  		span.Finish()
   367  		nodeID = newID
   368  	}
   369  
   370  	// Inform the RPC context of the node ID.
   371  	n.storeCfg.RPCContext.NodeID.Set(ctx, nodeID)
   372  
   373  	n.startedAt = n.storeCfg.Clock.Now().WallTime
   374  	n.Descriptor = roachpb.NodeDescriptor{
   375  		NodeID:          nodeID,
   376  		Address:         util.MakeUnresolvedAddr(addr.Network(), addr.String()),
   377  		SQLAddress:      util.MakeUnresolvedAddr(sqlAddr.Network(), sqlAddr.String()),
   378  		Attrs:           attrs,
   379  		Locality:        locality,
   380  		LocalityAddress: localityAddress,
   381  		ClusterName:     clusterName,
   382  		ServerVersion:   n.storeCfg.Settings.Version.BinaryVersion(),
   383  		BuildTag:        build.GetInfo().Tag,
   384  		StartedAt:       n.startedAt,
   385  	}
   386  	// Invoke any passed in nodeDescriptorCallback as soon as it's available, to
   387  	// ensure that other components (currently the DistSQLPlanner) are initialized
   388  	// before store startup continues.
   389  	if nodeDescriptorCallback != nil {
   390  		nodeDescriptorCallback(n.Descriptor)
   391  	}
   392  
   393  	// Gossip the node descriptor to make this node addressable by node ID.
   394  	n.storeCfg.Gossip.NodeID.Set(ctx, n.Descriptor.NodeID)
   395  	if err := n.storeCfg.Gossip.SetNodeDescriptor(&n.Descriptor); err != nil {
   396  		return errors.Errorf("couldn't gossip descriptor for node %d: %s", n.Descriptor.NodeID, err)
   397  	}
   398  
   399  	// Start the closed timestamp subsystem.
   400  	n.storeCfg.ClosedTimestamp.Start(n.Descriptor.NodeID)
   401  
   402  	// Create stores from the engines that were already bootstrapped.
   403  	for _, e := range state.initializedEngines {
   404  		s := kvserver.NewStore(ctx, n.storeCfg, e, &n.Descriptor)
   405  		if err := s.Start(ctx, n.stopper); err != nil {
   406  			return errors.Errorf("failed to start store: %s", err)
   407  		}
   408  		capacity, err := s.Capacity(false /* useCached */)
   409  		if err != nil {
   410  			return errors.Errorf("could not query store capacity: %s", err)
   411  		}
   412  		log.Infof(ctx, "initialized store %s: %+v", s, capacity)
   413  
   414  		n.addStore(s)
   415  	}
   416  
   417  	// Verify all initialized stores agree on cluster and node IDs.
   418  	if err := n.validateStores(ctx); err != nil {
   419  		return err
   420  	}
   421  	log.VEventf(ctx, 2, "validated stores")
   422  
   423  	// Compute the time this node was last up; this is done by reading the
   424  	// "last up time" from every store and choosing the most recent timestamp.
   425  	var mostRecentTimestamp hlc.Timestamp
   426  	if err := n.stores.VisitStores(func(s *kvserver.Store) error {
   427  		timestamp, err := s.ReadLastUpTimestamp(ctx)
   428  		if err != nil {
   429  			return err
   430  		}
   431  		if mostRecentTimestamp.Less(timestamp) {
   432  			mostRecentTimestamp = timestamp
   433  		}
   434  		return nil
   435  	}); err != nil {
   436  		return errors.Wrapf(err, "failed to read last up timestamp from stores")
   437  	}
   438  	n.lastUp = mostRecentTimestamp.WallTime
   439  
   440  	// Set the stores map as the gossip persistent storage, so that
   441  	// gossip can bootstrap using the most recently persisted set of
   442  	// node addresses.
   443  	if err := n.storeCfg.Gossip.SetStorage(n.stores); err != nil {
   444  		return fmt.Errorf("failed to initialize the gossip interface: %s", err)
   445  	}
   446  
   447  	// Bootstrap any uninitialized stores.
   448  	//
   449  	// TODO(tbg): address https://github.com/cockroachdb/cockroach/issues/39415.
   450  	// Should be easy enough. Writing the test is probably most of the work.
   451  	if len(state.newEngines) > 0 {
   452  		if err := n.bootstrapStores(ctx, state.newEngines, n.stopper); err != nil {
   453  			return err
   454  		}
   455  	}
   456  
   457  	n.startComputePeriodicMetrics(n.stopper, base.DefaultMetricsSampleInterval)
   458  
   459  	// Be careful about moving this call above the store startup loop; store
   460  	// migrations rely on the cluster version not yet having been updated via
   461  	// Gossip (we have migrations that want to run only if the server starts with
   462  	// a given cluster version, but not if it starts with a lower one and gets
   463  	// bumped immediately, which would be possible if gossip got started earlier).
   464  	n.startGossip(ctx, n.stopper)
   465  
   466  	allEngines := append([]storage.Engine(nil), state.initializedEngines...)
   467  	allEngines = append(allEngines, state.newEngines...)
   468  	log.Infof(ctx, "%s: started with %v engine(s) and attributes %v", n, allEngines, attrs.Attrs)
   469  	return nil
   470  }
   471  
   472  // IsDraining returns true if at least one Store housed on this Node is not
   473  // currently allowing range leases to be procured or extended.
   474  func (n *Node) IsDraining() bool {
   475  	var isDraining bool
   476  	if err := n.stores.VisitStores(func(s *kvserver.Store) error {
   477  		isDraining = isDraining || s.IsDraining()
   478  		return nil
   479  	}); err != nil {
   480  		panic(err)
   481  	}
   482  	return isDraining
   483  }
   484  
   485  // SetDraining sets the draining mode on all of the node's underlying stores.
   486  // The reporter callback, if non-nil, is called on a best effort basis
   487  // to report work that needed to be done and which may or may not have
   488  // been done by the time this call returns. See the explanation in
   489  // pkg/server/drain.go for details.
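        //
        // A minimal sketch of a caller (the reporter shown simply logs the remaining
        // work; names are illustrative):
        //
        //	err := n.SetDraining(true /* drain */, func(remaining int, what string) {
        //		log.Infof(ctx, "%d %s remaining to drain", remaining, what)
        //	})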
   490  func (n *Node) SetDraining(drain bool, reporter func(int, string)) error {
   491  	return n.stores.VisitStores(func(s *kvserver.Store) error {
   492  		s.SetDraining(drain, reporter)
   493  		return nil
   494  	})
   495  }
   496  
   497  // SetHLCUpperBound sets the upper bound of the HLC wall time on all of the
   498  // node's underlying stores.
   499  func (n *Node) SetHLCUpperBound(ctx context.Context, hlcUpperBound int64) error {
   500  	return n.stores.VisitStores(func(s *kvserver.Store) error {
   501  		return s.WriteHLCUpperBound(ctx, hlcUpperBound)
   502  	})
   503  }
   504  
   505  func (n *Node) addStore(store *kvserver.Store) {
   506  	cv, err := store.GetClusterVersion(context.TODO())
   507  	if err != nil {
   508  		log.Fatalf(context.TODO(), "%v", err)
   509  	}
   510  	if cv == (clusterversion.ClusterVersion{}) {
   511  		// The store should have had a version written to it during the store
   512  		// bootstrap process.
   513  		log.Fatal(context.TODO(), "attempting to add a store without a version")
   514  	}
   515  	n.stores.AddStore(store)
   516  	n.recorder.AddStore(store)
   517  }
   518  
   519  // validateStores iterates over all stores, verifying they agree on node ID.
   520  // The node's ident is initialized based on the agreed-upon node ID. Note that
   521  // cluster ID consistency is checked elsewhere in inspectEngines.
   522  //
   523  // TODO(tbg): remove this, we already validate everything in inspectEngines now.
   524  func (n *Node) validateStores(ctx context.Context) error {
   525  	return n.stores.VisitStores(func(s *kvserver.Store) error {
   526  		if n.Descriptor.NodeID != s.Ident.NodeID {
   527  			return errors.Errorf("store %s node ID doesn't match node ID: %d", s, n.Descriptor.NodeID)
   528  		}
   529  		return nil
   530  	})
   531  }
   532  
   533  // bootstrapStores bootstraps uninitialized stores once the cluster
   534  // and node IDs have been established for this node. Store IDs are
   535  // allocated via a sequence ID generator stored at a cluster-wide system key.
   536  // The new stores are added to n.stores.
   537  func (n *Node) bootstrapStores(
   538  	ctx context.Context, emptyEngines []storage.Engine, stopper *stop.Stopper,
   539  ) error {
   540  	if n.clusterID.Get() == uuid.Nil {
   541  		return errors.New("ClusterID missing during store bootstrap of auxiliary store")
   542  	}
   543  
   544  	{
   545  		// Bootstrap all waiting stores by allocating a new store id for
   546  		// each and invoking kvserver.InitEngine() to persist it, and then
   547  		// creating and starting the stores.
   548  		inc := int64(len(emptyEngines))
   549  		firstID, err := allocateStoreIDs(ctx, n.Descriptor.NodeID, inc, n.storeCfg.DB)
   550  		if err != nil {
   551  			return errors.Errorf("error allocating store ids: %s", err)
   552  		}
   553  		sIdent := roachpb.StoreIdent{
   554  			ClusterID: n.clusterID.Get(),
   555  			NodeID:    n.Descriptor.NodeID,
   556  			StoreID:   firstID,
   557  		}
   558  		for _, eng := range emptyEngines {
   559  			if err := kvserver.InitEngine(ctx, eng, sIdent); err != nil {
   560  				return err
   561  			}
   562  
   563  			s := kvserver.NewStore(ctx, n.storeCfg, eng, &n.Descriptor)
   564  			if err := s.Start(ctx, stopper); err != nil {
   565  				return err
   566  			}
   567  			n.addStore(s)
   568  			log.Infof(ctx, "bootstrapped store %s", s)
   569  			// Done regularly in Node.startGossip, but this cuts down the time
   570  			// until this store is used for range allocations.
   571  			if err := s.GossipStore(ctx, false /* useCached */); err != nil {
   572  				log.Warningf(ctx, "error doing initial gossiping: %s", err)
   573  			}
   574  
   575  			sIdent.StoreID++
   576  		}
   577  	}
   578  
   579  	// Write a new status summary after all stores have been bootstrapped; this
   580  	// helps the UI remain responsive when new nodes are added.
   581  	if err := n.writeNodeStatus(ctx, 0 /* alertTTL */); err != nil {
   582  		log.Warningf(ctx, "error writing node summary after store bootstrap: %s", err)
   583  	}
   584  
   585  	return nil
   586  }
   587  
   588  // startGossip loops on a periodic ticker to gossip node-related
   589  // information. Starts a goroutine to loop until the node is closed.
   590  func (n *Node) startGossip(ctx context.Context, stopper *stop.Stopper) {
   591  	ctx = n.AnnotateCtx(ctx)
   592  	stopper.RunWorker(ctx, func(ctx context.Context) {
   593  		// Verify we've already gossiped our node descriptor.
   594  		//
   595  		// TODO(tbg): see if we really needed to do this earlier already. We
   596  		// probably needed to (this call has to come late for ... reasons I
   597  		// still need to look into) and nobody can talk to this node until
   598  		// the descriptor is in Gossip.
   599  		if _, err := n.storeCfg.Gossip.GetNodeDescriptor(n.Descriptor.NodeID); err != nil {
   600  			panic(err)
   601  		}
   602  
   603  		// NB: Gossip may not be connected at this point. That's fine though,
   604  		// we can still gossip something; Gossip sends it out reactively once
   605  		// it can.
   606  
   607  		statusTicker := time.NewTicker(gossipStatusInterval)
   608  		storesTicker := time.NewTicker(gossip.StoresInterval)
   609  		nodeTicker := time.NewTicker(gossip.NodeDescriptorInterval)
        		defer statusTicker.Stop()
   610  		defer storesTicker.Stop()
   611  		defer nodeTicker.Stop()
   612  		n.gossipStores(ctx) // one-off run before going to sleep
   613  		for {
   614  			select {
   615  			case <-statusTicker.C:
   616  				n.storeCfg.Gossip.LogStatus()
   617  			case <-storesTicker.C:
   618  				n.gossipStores(ctx)
   619  			case <-nodeTicker.C:
   620  				if err := n.storeCfg.Gossip.SetNodeDescriptor(&n.Descriptor); err != nil {
   621  					log.Warningf(ctx, "couldn't gossip descriptor for node %d: %s", n.Descriptor.NodeID, err)
   622  				}
   623  			case <-stopper.ShouldStop():
   624  				return
   625  			}
   626  		}
   627  	})
   628  }
   629  
   630  // gossipStores broadcasts each store descriptor to the gossip network.
   631  func (n *Node) gossipStores(ctx context.Context) {
   632  	if err := n.stores.VisitStores(func(s *kvserver.Store) error {
   633  		return s.GossipStore(ctx, false /* useCached */)
   634  	}); err != nil {
   635  		log.Warningf(ctx, "%v", err)
   636  	}
   637  }
   638  
   639  // startComputePeriodicMetrics starts a loop which periodically instructs each
   640  // store to compute the value of metrics which cannot be incrementally
   641  // maintained.
   642  func (n *Node) startComputePeriodicMetrics(stopper *stop.Stopper, interval time.Duration) {
   643  	ctx := n.AnnotateCtx(context.Background())
   644  	stopper.RunWorker(ctx, func(ctx context.Context) {
   645  		// Compute periodic stats at the same frequency as metrics are sampled.
   646  		ticker := time.NewTicker(interval)
   647  		defer ticker.Stop()
   648  		for tick := 0; ; tick++ {
   649  			select {
   650  			case <-ticker.C:
   651  				if err := n.computePeriodicMetrics(ctx, tick); err != nil {
   652  					log.Errorf(ctx, "failed computing periodic metrics: %s", err)
   653  				}
   654  			case <-stopper.ShouldStop():
   655  				return
   656  			}
   657  		}
   658  	})
   659  }
   660  
   661  // computePeriodicMetrics instructs each store to compute the value of
   662  // metrics that cannot be incrementally maintained.
   663  func (n *Node) computePeriodicMetrics(ctx context.Context, tick int) error {
   664  	return n.stores.VisitStores(func(store *kvserver.Store) error {
   665  		if err := store.ComputeMetrics(ctx, tick); err != nil {
   666  			log.Warningf(ctx, "%s: unable to compute metrics: %s", store, err)
   667  		}
   668  		return nil
   669  	})
   670  }
   671  
   672  func (n *Node) startGraphiteStatsExporter(st *cluster.Settings) {
   673  	ctx := logtags.AddTag(n.AnnotateCtx(context.Background()), "graphite stats exporter", nil)
   674  	pm := metric.MakePrometheusExporter()
   675  
   676  	n.stopper.RunWorker(ctx, func(ctx context.Context) {
   677  		var timer timeutil.Timer
   678  		defer timer.Stop()
   679  		for {
   680  			timer.Reset(graphiteInterval.Get(&st.SV))
   681  			select {
   682  			case <-n.stopper.ShouldStop():
   683  				return
   684  			case <-timer.C:
   685  				timer.Read = true
   686  				endpoint := graphiteEndpoint.Get(&st.SV)
   687  				if endpoint != "" {
   688  					if err := n.recorder.ExportToGraphite(ctx, endpoint, &pm); err != nil {
   689  						log.Infof(ctx, "error pushing metrics to graphite: %s\n", err)
   690  					}
   691  				}
   692  			}
   693  		}
   694  	})
   695  }
   696  
   697  // startWriteNodeStatus begins periodically persisting status summaries for the
   698  // node and its stores.
   699  func (n *Node) startWriteNodeStatus(frequency time.Duration) {
   700  	ctx := logtags.AddTag(n.AnnotateCtx(context.Background()), "summaries", nil)
   701  	// Immediately record summaries once on server startup.
   702  	if err := n.writeNodeStatus(ctx, 0 /* alertTTL */); err != nil {
   703  		log.Warningf(ctx, "error recording initial status summaries: %s", err)
   704  	}
   705  	n.stopper.RunWorker(ctx, func(ctx context.Context) {
   706  		// Write a status summary immediately; this helps the UI remain
   707  		// responsive when new nodes are added.
   708  		ticker := time.NewTicker(frequency)
   709  		defer ticker.Stop()
   710  		for {
   711  			select {
   712  			case <-ticker.C:
   713  				// Use an alertTTL of twice the ticker frequency. This makes sure that
   714  				// alerts don't disappear and reappear spuriously while at the same
   715  				// time ensuring that an alert doesn't linger for too long after having
   716  				// resolved.
   717  				if err := n.writeNodeStatus(ctx, 2*frequency); err != nil {
   718  					log.Warningf(ctx, "error recording status summaries: %s", err)
   719  				}
   720  			case <-n.stopper.ShouldStop():
   721  				return
   722  			}
   723  		}
   724  	})
   725  }
   726  
   727  // writeNodeStatus retrieves status summaries from the supplied
   728  // NodeStatusRecorder and persists them to the cockroach data store.
   729  func (n *Node) writeNodeStatus(ctx context.Context, alertTTL time.Duration) error {
   730  	var err error
   731  	if runErr := n.stopper.RunTask(ctx, "node.Node: writing summary", func(ctx context.Context) {
   732  		nodeStatus := n.recorder.GenerateNodeStatus(ctx)
   733  		if nodeStatus == nil {
   734  			return
   735  		}
   736  
   737  		if result := n.recorder.CheckHealth(ctx, *nodeStatus); len(result.Alerts) != 0 {
   738  			var numNodes int
   739  			if err := n.storeCfg.Gossip.IterateInfos(gossip.KeyNodeIDPrefix, func(k string, info gossip.Info) error {
   740  				numNodes++
   741  				return nil
   742  			}); err != nil {
   743  				log.Warningf(ctx, "%v", err)
   744  			}
   745  			if numNodes > 1 {
   746  				// Avoid this warning on single-node clusters, which require special UX.
   747  				log.Warningf(ctx, "health alerts detected: %+v", result)
   748  			}
   749  			if err := n.storeCfg.Gossip.AddInfoProto(
   750  				gossip.MakeNodeHealthAlertKey(n.Descriptor.NodeID), &result, alertTTL,
   751  			); err != nil {
   752  				log.Warningf(ctx, "unable to gossip health alerts: %+v", result)
   753  			}
   754  
   755  			// TODO(tschottdorf): add a metric that we increment every time there are
   756  			// alerts. This can help understand how long the cluster has been in that
   757  			// state (since it'll be incremented every ~10s).
   758  		}
   759  
   760  		err = n.recorder.WriteNodeStatus(ctx, n.storeCfg.DB, *nodeStatus)
   761  	}); runErr != nil {
   762  		err = runErr
   763  	}
   764  	return err
   765  }
   766  
   767  // recordJoinEvent begins an asynchronous task which attempts to log a "node
   768  // join" or "node restart" event. This query will retry until it succeeds or the
   769  // server stops.
   770  func (n *Node) recordJoinEvent() {
   771  	if !n.storeCfg.LogRangeEvents {
   772  		return
   773  	}
   774  
   775  	logEventType := sql.EventLogNodeRestart
   776  	lastUp := n.lastUp
   777  	if n.initialBoot {
   778  		logEventType = sql.EventLogNodeJoin
   779  		lastUp = n.startedAt
   780  	}
   781  
   782  	n.stopper.RunWorker(context.Background(), func(bgCtx context.Context) {
   783  		ctx, span := n.AnnotateCtxWithSpan(bgCtx, "record-join-event")
   784  		defer span.Finish()
   785  		retryOpts := base.DefaultRetryOptions()
   786  		retryOpts.Closer = n.stopper.ShouldStop()
   787  		for r := retry.Start(retryOpts); r.Next(); {
   788  			if err := n.storeCfg.DB.Txn(ctx, func(ctx context.Context, txn *kv.Txn) error {
   789  				return n.eventLogger.InsertEventRecord(
   790  					ctx,
   791  					txn,
   792  					logEventType,
   793  					int32(n.Descriptor.NodeID),
   794  					int32(n.Descriptor.NodeID),
   795  					struct {
   796  						Descriptor roachpb.NodeDescriptor
   797  						ClusterID  uuid.UUID
   798  						StartedAt  int64
   799  						LastUp     int64
   800  					}{n.Descriptor, n.clusterID.Get(), n.startedAt, lastUp},
   801  				)
   802  			}); err != nil {
   803  				log.Warningf(ctx, "%s: unable to log %s event: %s", n, logEventType, err)
   804  			} else {
   805  				return
   806  			}
   807  		}
   808  	})
   809  }
   810  
   811  // If we receive a (proto-marshaled) roachpb.BatchRequest whose Requests contain
   812  // a message type unknown to this node, we will end up with a zero entry in the
   813  // slice. If we don't error out early, this breaks all sorts of assumptions and
   814  // usually ends in a panic.
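        //
        // For example, a RequestUnion decoded from a request type this binary does not
        // know about has no variant set, so GetValue() returns nil and the batch is
        // rejected up front (sketch):
        //
        //	var unknown roachpb.RequestUnion // no variant set after decoding
        //	_ = checkNoUnknownRequest([]roachpb.RequestUnion{unknown}) // non-nil error detail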
   815  func checkNoUnknownRequest(reqs []roachpb.RequestUnion) *roachpb.UnsupportedRequestError {
   816  	for _, req := range reqs {
   817  		if req.GetValue() == nil {
   818  			return &roachpb.UnsupportedRequestError{}
   819  		}
   820  	}
   821  	return nil
   822  }
   823  
   824  func (n *Node) batchInternal(
   825  	ctx context.Context, args *roachpb.BatchRequest,
   826  ) (*roachpb.BatchResponse, error) {
   827  	if detail := checkNoUnknownRequest(args.Requests); detail != nil {
   828  		var br roachpb.BatchResponse
   829  		br.Error = roachpb.NewError(detail)
   830  		return &br, nil
   831  	}
   832  
   833  	var br *roachpb.BatchResponse
   834  	if err := n.stopper.RunTaskWithErr(ctx, "node.Node: batch", func(ctx context.Context) error {
   835  		var finishSpan func(*roachpb.BatchResponse)
   836  		// Shadow ctx from the outer function. Written like this to pass the linter.
   837  		ctx, finishSpan = n.setupSpanForIncomingRPC(ctx, grpcutil.IsLocalRequestContext(ctx))
   838  		// NB: wrapped to delay br evaluation to its value when returning.
   839  		defer func() { finishSpan(br) }()
   840  		if log.HasSpanOrEvent(ctx) {
   841  			log.Eventf(ctx, "node received request: %s", args.Summary())
   842  		}
   843  
   844  		tStart := timeutil.Now()
   845  		var pErr *roachpb.Error
   846  		br, pErr = n.stores.Send(ctx, *args)
   847  		if pErr != nil {
   848  			br = &roachpb.BatchResponse{}
   849  			log.VErrEventf(ctx, 3, "%T", pErr.GetDetail())
   850  		}
   851  		if br.Error != nil {
   852  			panic(roachpb.ErrorUnexpectedlySet(n.stores, br))
   853  		}
   854  		n.metrics.callComplete(timeutil.Since(tStart), pErr)
   855  		br.Error = pErr
   856  		return nil
   857  	}); err != nil {
   858  		return nil, err
   859  	}
   860  	return br, nil
   861  }
   862  
   863  // Batch implements the roachpb.InternalServer interface.
   864  func (n *Node) Batch(
   865  	ctx context.Context, args *roachpb.BatchRequest,
   866  ) (*roachpb.BatchResponse, error) {
   867  	// NB: Node.Batch is called directly for "local" calls. We don't want to
   868  	// carry the associated log tags forward as doing so makes adding additional
   869  	// log tags more expensive and makes local calls differ from remote calls.
   870  	ctx = n.storeCfg.AmbientCtx.ResetAndAnnotateCtx(ctx)
   871  
   872  	br, err := n.batchInternal(ctx, args)
   873  
   874  	// We always return errors via BatchResponse.Error so structure is
   875  	// preserved; plain errors are presumed to be from the RPC
   876  	// framework and not from cockroach.
   877  	if err != nil {
   878  		if br == nil {
   879  			br = &roachpb.BatchResponse{}
   880  		}
   881  		if br.Error != nil {
   882  			log.Fatalf(
   883  				ctx, "attempting to return both a plain error (%s) and roachpb.Error (%s)", err, br.Error,
   884  			)
   885  		}
   886  		br.Error = roachpb.NewError(err)
   887  	}
   888  	return br, nil
   889  }
   890  
   891  // setupSpanForIncomingRPC takes a context and returns a derived context with a
   892  // new span in it. Depending on the input context, that span might be a root
   893  // span or a child span. If it is a child span, it might be a child span of a
   894  // local or a remote span. Note that supporting both the "child of local span"
   895  // and "child of remote span" cases is important, as this RPC can be called
   896  // either through the network or directly if the caller is local.
   897  //
   898  // It returns the derived context and a cleanup function to be called when
   899  // servicing the RPC is done. The cleanup function will close the span and, in
   900  // case the span was the child of a remote span and "snowball tracing" was
   901  // enabled on that parent span, it serializes the local trace into the
   902  // BatchResponse. The cleanup function takes the BatchResponse in which the
   903  // response is to be serialized. The BatchResponse can be nil in case no
   904  // response is to be returned to the RPC caller.
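        //
        // Typical use mirrors Node.batchInternal (a sketch):
        //
        //	ctx, finishSpan := n.setupSpanForIncomingRPC(ctx, grpcutil.IsLocalRequestContext(ctx))
        //	defer func() { finishSpan(br) }() // br may be nil if no response is returned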
   905  func (n *Node) setupSpanForIncomingRPC(
   906  	ctx context.Context, isLocalRequest bool,
   907  ) (context.Context, func(*roachpb.BatchResponse)) {
   908  	// The operation name matches the one created by the gRPC tracing interceptor
   909  	// in the remote (non-local) case below.
   910  	const opName = "/cockroach.roachpb.Internal/Batch"
   911  	var newSpan, grpcSpan opentracing.Span
   912  	if isLocalRequest {
   913  		// This is a local request which circumvented gRPC. Start a span now.
   914  		ctx, newSpan = tracing.ChildSpan(ctx, opName)
   915  	} else {
   916  		grpcSpan = opentracing.SpanFromContext(ctx)
   917  		if grpcSpan == nil {
   918  			// If tracing information was passed via gRPC metadata, the gRPC interceptor
   919  			// should have opened a span for us. If not, open a span now (if tracing is
   920  			// disabled, this will be a noop span).
   921  			newSpan = n.storeCfg.AmbientCtx.Tracer.(*tracing.Tracer).StartRootSpan(
   922  				opName, n.storeCfg.AmbientCtx.LogTags(), tracing.NonRecordableSpan,
   923  			)
   924  			ctx = opentracing.ContextWithSpan(ctx, newSpan)
   925  		} else {
   926  			grpcSpan.SetTag("node", n.Descriptor.NodeID)
   927  		}
   928  	}
   929  
   930  	finishSpan := func(br *roachpb.BatchResponse) {
   931  		if newSpan != nil {
   932  			newSpan.Finish()
   933  		}
   934  		if br == nil {
   935  			return
   936  		}
   937  		if grpcSpan != nil {
   938  			// If this is a "snowball trace", we'll need to return all the recorded
   939  			// spans in the BatchResponse at the end of the request.
   940  			// We don't want to do this if the operation is on the same host, in which
   941  			// case everything is already part of the same recording.
   942  			if rec := tracing.GetRecording(grpcSpan); rec != nil {
   943  				br.CollectedSpans = append(br.CollectedSpans, rec...)
   944  			}
   945  		}
   946  	}
   947  	return ctx, finishSpan
   948  }
   949  
   950  // RangeFeed implements the roachpb.InternalServer interface.
   951  func (n *Node) RangeFeed(
   952  	args *roachpb.RangeFeedRequest, stream roachpb.Internal_RangeFeedServer,
   953  ) error {
   954  	growstack.Grow()
   955  
   956  	pErr := n.stores.RangeFeed(args, stream)
   957  	if pErr != nil {
   958  		var event roachpb.RangeFeedEvent
   959  		event.SetValue(&roachpb.RangeFeedError{
   960  			Error: *pErr,
   961  		})
   962  		return stream.Send(&event)
   963  	}
   964  	return nil
   965  }