github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/store.go

     1  // Copyright 2014 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package kvserver
    12  
    13  import (
    14  	"bytes"
    15  	"context"
    16  	"fmt"
    17  	"math"
    18  	"os"
    19  	"path/filepath"
    20  	"runtime"
    21  	"sort"
    22  	"strings"
    23  	"sync"
    24  	"sync/atomic"
    25  	"time"
    26  	"unsafe"
    27  
    28  	"github.com/cockroachdb/cockroach/pkg/base"
    29  	"github.com/cockroachdb/cockroach/pkg/clusterversion"
    30  	"github.com/cockroachdb/cockroach/pkg/config"
    31  	"github.com/cockroachdb/cockroach/pkg/config/zonepb"
    32  	"github.com/cockroachdb/cockroach/pkg/gossip"
    33  	"github.com/cockroachdb/cockroach/pkg/keys"
    34  	"github.com/cockroachdb/cockroach/pkg/kv"
    35  	"github.com/cockroachdb/cockroach/pkg/kv/kvbase"
    36  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/batcheval"
    37  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/closedts/container"
    38  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/closedts/ctpb"
    39  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/compactor"
    40  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/idalloc"
    41  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/intentresolver"
    42  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/protectedts"
    43  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/raftentry"
    44  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/tscache"
    45  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/txnrecovery"
    46  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/txnwait"
    47  	"github.com/cockroachdb/cockroach/pkg/roachpb"
    48  	"github.com/cockroachdb/cockroach/pkg/rpc"
    49  	"github.com/cockroachdb/cockroach/pkg/rpc/nodedialer"
    50  	"github.com/cockroachdb/cockroach/pkg/settings"
    51  	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
    52  	"github.com/cockroachdb/cockroach/pkg/sql/sqlutil"
    53  	"github.com/cockroachdb/cockroach/pkg/storage"
    54  	"github.com/cockroachdb/cockroach/pkg/storage/cloud"
    55  	"github.com/cockroachdb/cockroach/pkg/storage/enginepb"
    56  	"github.com/cockroachdb/cockroach/pkg/util/contextutil"
    57  	"github.com/cockroachdb/cockroach/pkg/util/envutil"
    58  	"github.com/cockroachdb/cockroach/pkg/util/hlc"
    59  	"github.com/cockroachdb/cockroach/pkg/util/limit"
    60  	"github.com/cockroachdb/cockroach/pkg/util/log"
    61  	"github.com/cockroachdb/cockroach/pkg/util/metric"
    62  	"github.com/cockroachdb/cockroach/pkg/util/protoutil"
    63  	"github.com/cockroachdb/cockroach/pkg/util/quotapool"
    64  	"github.com/cockroachdb/cockroach/pkg/util/retry"
    65  	"github.com/cockroachdb/cockroach/pkg/util/shuffle"
    66  	"github.com/cockroachdb/cockroach/pkg/util/stop"
    67  	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
    68  	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
    69  	"github.com/cockroachdb/cockroach/pkg/util/tracing"
    70  	"github.com/cockroachdb/cockroach/pkg/util/uuid"
    71  	"github.com/cockroachdb/errors"
    72  	"github.com/cockroachdb/logtags"
    73  	"github.com/google/btree"
    74  	"go.etcd.io/etcd/raft"
    75  	"golang.org/x/time/rate"
    76  )
    77  
    78  const (
    79  	// rangeIDAllocCount is the number of Range IDs to allocate per allocation.
    80  	rangeIDAllocCount                 = 10
    81  	defaultRaftHeartbeatIntervalTicks = 5
    82  
    83  	// defaultRaftEntryCacheSize is the default size in bytes for a
    84  	// store's Raft log entry cache.
    85  	defaultRaftEntryCacheSize = 1 << 24 // 16M
    86  
    87  	// replicaRequestQueueSize specifies the maximum number of requests to queue
    88  	// for a replica.
    89  	replicaRequestQueueSize = 100
    90  
    91  	defaultGossipWhenCapacityDeltaExceedsFraction = 0.01
    92  
    93  	// systemDataGossipInterval is the interval at which range lease
    94  	// holders verify that the most recent system data is gossiped.
    95  	// This ensures that system data is always eventually gossiped, even
    96  	// if a range lease holder experiences a failure causing a missed
    97  	// gossip update.
    98  	systemDataGossipInterval = 1 * time.Minute
    99  )
   100  
   101  var storeSchedulerConcurrency = envutil.EnvOrDefaultInt(
   102  	"COCKROACH_SCHEDULER_CONCURRENCY", 8*runtime.NumCPU())
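         // For example (illustrative; the variable above is read via
         // envutil.EnvOrDefaultInt, and "cockroach start" is the usual entry
         // point), this default can be overridden at process start:
         //
         //   COCKROACH_SCHEDULER_CONCURRENCY=16 ./cockroach start <flags...>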
   103  
   104  var logSSTInfoTicks = envutil.EnvOrDefaultInt(
   105  	"COCKROACH_LOG_SST_INFO_TICKS_INTERVAL", 60,
   106  )
   107  
   108  // bulkIOWriteLimit is defined here because it is used by BulkIOWriteLimiter.
   109  var bulkIOWriteLimit = settings.RegisterPublicByteSizeSetting(
   110  	"kv.bulk_io_write.max_rate",
   111  	"the rate limit (bytes/sec) to use for writes to disk on behalf of bulk io ops",
   112  	1<<40,
   113  )
   114  
   115  // importRequestsLimit limits concurrent import requests.
   116  var importRequestsLimit = settings.RegisterPositiveIntSetting(
   117  	"kv.bulk_io_write.concurrent_import_requests",
   118  	"number of import requests a store will handle concurrently before queuing",
   119  	1,
   120  )
   121  
   122  // addSSTableRequestLimit limits concurrent AddSSTable requests.
   123  var addSSTableRequestLimit = settings.RegisterPositiveIntSetting(
   124  	"kv.bulk_io_write.concurrent_addsstable_requests",
   125  	"number of AddSSTable requests a store will handle concurrently before queuing",
   126  	1,
   127  )
   128  
   129  // concurrentRangefeedItersLimit limits concurrent rangefeed catchup iterators.
   130  var concurrentRangefeedItersLimit = settings.RegisterPositiveIntSetting(
   131  	"kv.rangefeed.concurrent_catchup_iterators",
    132  	"number of rangefeed catchup iterators a store will allow concurrently before queuing",
   133  	64,
   134  )
   135  
    136  // raftLeadershipTransferWait limits the amount of time a drain command
    137  // waits for lease transfers.
   138  var raftLeadershipTransferWait = func() *settings.DurationSetting {
   139  	s := settings.RegisterValidatedDurationSetting(
   140  		raftLeadershipTransferWaitKey,
   141  		"the amount of time a server waits to transfer range leases before proceeding with the rest of the shutdown process",
   142  		5*time.Second,
   143  		func(v time.Duration) error {
   144  			if v < 0 {
   145  				return errors.Errorf("cannot set %s to a negative duration: %s",
   146  					raftLeadershipTransferWaitKey, v)
   147  			}
   148  			return nil
   149  		},
   150  	)
   151  	s.SetVisibility(settings.Public)
   152  	return s
   153  }()
   154  
   155  const raftLeadershipTransferWaitKey = "server.shutdown.lease_transfer_wait"
   156  
   157  // ExportRequestsLimit is the number of Export requests that can run at once.
   158  // Each extracts data from RocksDB to a temp file and then uploads it to cloud
   159  // storage. In order to not exhaust the disk or memory, or saturate the network,
    160  // limit the number of these that can be run in parallel. This number was chosen
    161  // by guessing; it could be improved by more measured heuristics. It is exported
    162  // because we also check it in the caller, to limit generated requests and
    163  // prevent excessive queuing.
   164  var ExportRequestsLimit = settings.RegisterPositiveIntSetting(
   165  	"kv.bulk_io_write.concurrent_export_requests",
   166  	"number of export requests a store will handle concurrently before queuing",
   167  	3,
   168  )
   169  
    170  // TestStoreConfig returns a StoreConfig with some fields initialized to values suitable for tests.
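         //
         // A hedged usage sketch (illustrative only; a manual clock is commonly
         // paired with this helper in tests):
         //
         //   manual := hlc.NewManualClock(123)
         //   clock := hlc.NewClock(manual.UnixNano, time.Nanosecond)
         //   cfg := TestStoreConfig(clock)
         //   cfg.TestingKnobs.DisableSplitQueue = true // optional knob tweaks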
   171  func TestStoreConfig(clock *hlc.Clock) StoreConfig {
   172  	if clock == nil {
   173  		clock = hlc.NewClock(hlc.UnixNano, time.Nanosecond)
   174  	}
   175  	st := cluster.MakeTestingClusterSettings()
   176  	sc := StoreConfig{
   177  		DefaultZoneConfig:           zonepb.DefaultZoneConfigRef(),
   178  		DefaultSystemZoneConfig:     zonepb.DefaultSystemZoneConfigRef(),
   179  		Settings:                    st,
   180  		AmbientCtx:                  log.AmbientContext{Tracer: st.Tracer},
   181  		Clock:                       clock,
   182  		CoalescedHeartbeatsInterval: 50 * time.Millisecond,
   183  		RaftHeartbeatIntervalTicks:  1,
   184  		ScanInterval:                10 * time.Minute,
   185  		HistogramWindowInterval:     metric.TestSampleInterval,
   186  		EnableEpochRangeLeases:      true,
   187  		ClosedTimestamp:             container.NoopContainer(),
   188  		ProtectedTimestampCache:     protectedts.EmptyCache(clock),
   189  	}
   190  
   191  	// Use shorter Raft tick settings in order to minimize start up and failover
   192  	// time in tests.
   193  	sc.RaftElectionTimeoutTicks = 3
   194  	sc.RaftTickInterval = 100 * time.Millisecond
   195  	sc.SetDefaults()
   196  	return sc
   197  }
   198  
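         // newRaftConfig assembles a raft.Config from the store's settings. A
         // hedged sketch of how a replica might use it (raftStorage, replicaID,
         // appliedIndex, and logger are placeholders; NewRawNode is the etcd raft
         // entry point):
         //
         //   rn, err := raft.NewRawNode(newRaftConfig(
         //       raftStorage, uint64(replicaID), appliedIndex, storeCfg, logger))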
   199  func newRaftConfig(
   200  	strg raft.Storage, id uint64, appliedIndex uint64, storeCfg StoreConfig, logger raft.Logger,
   201  ) *raft.Config {
   202  	return &raft.Config{
   203  		ID:                        id,
   204  		Applied:                   appliedIndex,
   205  		ElectionTick:              storeCfg.RaftElectionTimeoutTicks,
   206  		HeartbeatTick:             storeCfg.RaftHeartbeatIntervalTicks,
   207  		MaxUncommittedEntriesSize: storeCfg.RaftMaxUncommittedEntriesSize,
   208  		MaxCommittedSizePerReady:  storeCfg.RaftMaxCommittedSizePerReady,
   209  		MaxSizePerMsg:             storeCfg.RaftMaxSizePerMsg,
   210  		MaxInflightMsgs:           storeCfg.RaftMaxInflightMsgs,
   211  		Storage:                   strg,
   212  		Logger:                    logger,
   213  
   214  		PreVote: true,
   215  	}
   216  }
   217  
    218  // verifyKeys verifies keys. If checkEndKey is true, then the end key
    219  // is verified to be non-nil and greater than the start key. If
    220  // checkEndKey is false, the end key is verified to be nil. Additionally,
    221  // it verifies that the start key is less than KeyMax and the end key is
    222  // less than or equal to KeyMax. It also verifies that a key range that
    223  // contains range-local keys is completely range-local.
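         //
         // For example (illustrative):
         //
         //   verifyKeys(roachpb.Key("a"), roachpb.Key("b"), true)  // ok: end > start
         //   verifyKeys(roachpb.Key("b"), roachpb.Key("a"), true)  // error: end not greater than start
         //   verifyKeys(roachpb.Key("a"), nil, false)              // ok: point operation, no end key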
   224  func verifyKeys(start, end roachpb.Key, checkEndKey bool) error {
   225  	if bytes.Compare(start, roachpb.KeyMax) >= 0 {
   226  		return errors.Errorf("start key %q must be less than KeyMax", start)
   227  	}
   228  	if !checkEndKey {
   229  		if len(end) != 0 {
   230  			return errors.Errorf("end key %q should not be specified for this operation", end)
   231  		}
   232  		return nil
   233  	}
   234  	if end == nil {
   235  		return errors.Errorf("end key must be specified")
   236  	}
   237  	if bytes.Compare(roachpb.KeyMax, end) < 0 {
   238  		return errors.Errorf("end key %q must be less than or equal to KeyMax", end)
   239  	}
   240  	{
   241  		sAddr, err := keys.Addr(start)
   242  		if err != nil {
   243  			return err
   244  		}
   245  		eAddr, err := keys.Addr(end)
   246  		if err != nil {
   247  			return err
   248  		}
   249  		if !sAddr.Less(eAddr) {
   250  			return errors.Errorf("end key %q must be greater than start %q", end, start)
   251  		}
   252  		if !bytes.Equal(sAddr, start) {
   253  			if bytes.Equal(eAddr, end) {
   254  				return errors.Errorf("start key is range-local, but end key is not")
   255  			}
   256  		} else if bytes.Compare(start, keys.LocalMax) < 0 {
   257  			// It's a range op, not local but somehow plows through local data -
   258  			// not cool.
   259  			return errors.Errorf("start key in [%q,%q) must be greater than LocalMax", start, end)
   260  		}
   261  	}
   262  
   263  	return nil
   264  }
   265  
    266  // rangeKeyItem is a common interface implemented by types that have a
    267  // start key, such as rangeBTreeKey below.
   267  type rangeKeyItem interface {
   268  	startKey() roachpb.RKey
   269  }
   270  
    271  // rangeBTreeKey is a type wrapper around roachpb.RKey that implements the
    272  // rangeKeyItem interface and the btree.Item interface.
   273  type rangeBTreeKey roachpb.RKey
   274  
   275  var _ rangeKeyItem = rangeBTreeKey{}
   276  
   277  func (k rangeBTreeKey) startKey() roachpb.RKey {
   278  	return (roachpb.RKey)(k)
   279  }
   280  
   281  var _ btree.Item = rangeBTreeKey{}
   282  
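         // Less implements btree.Item. For example (an illustrative sketch using
         // the google/btree API), the item covering a lookup key is the one with
         // the greatest start key less than or equal to it:
         //
         //   var found btree.Item
         //   store.mu.replicasByKey.DescendLessOrEqual(rangeBTreeKey(key),
         //       func(i btree.Item) bool {
         //           found = i    // *Replica or *ReplicaPlaceholder
         //           return false // stop at the first (greatest) item
         //       })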
   283  func (k rangeBTreeKey) Less(i btree.Item) bool {
   284  	return k.startKey().Less(i.(rangeKeyItem).startKey())
   285  }
   286  
   287  // A NotBootstrappedError indicates that an engine has not yet been
   288  // bootstrapped due to a store identifier not being present.
   289  type NotBootstrappedError struct{}
   290  
   291  // Error formats error.
   292  func (e *NotBootstrappedError) Error() string {
   293  	return "store has not been bootstrapped"
   294  }
   295  
   296  // A storeReplicaVisitor calls a visitor function for each of a store's
   297  // initialized Replicas (in unspecified order). It provides an option
   298  // to visit replicas in increasing RangeID order.
   299  type storeReplicaVisitor struct {
   300  	store   *Store
   301  	repls   []*Replica // Replicas to be visited
   302  	ordered bool       // Option to visit replicas in sorted order
   303  	visited int        // Number of visited ranges, -1 before first call to Visit()
   304  }
   305  
   306  // Len implements sort.Interface.
   307  func (rs storeReplicaVisitor) Len() int { return len(rs.repls) }
   308  
   309  // Less implements sort.Interface.
   310  func (rs storeReplicaVisitor) Less(i, j int) bool { return rs.repls[i].RangeID < rs.repls[j].RangeID }
   311  
   312  // Swap implements sort.Interface.
   313  func (rs storeReplicaVisitor) Swap(i, j int) { rs.repls[i], rs.repls[j] = rs.repls[j], rs.repls[i] }
   314  
   315  // newStoreReplicaVisitor constructs a storeReplicaVisitor.
   316  func newStoreReplicaVisitor(store *Store) *storeReplicaVisitor {
   317  	return &storeReplicaVisitor{
   318  		store:   store,
   319  		visited: -1,
   320  	}
   321  }
   322  
   323  // InOrder tells the visitor to visit replicas in increasing RangeID order.
   324  func (rs *storeReplicaVisitor) InOrder() *storeReplicaVisitor {
   325  	rs.ordered = true
   326  	return rs
   327  }
   328  
   329  // Visit calls the visitor with each Replica until false is returned.
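         //
         // For example (illustrative), a caller can count the initialized replicas
         // in RangeID order:
         //
         //   var n int
         //   newStoreReplicaVisitor(s).InOrder().Visit(func(r *Replica) bool {
         //       n++
         //       return true // keep visiting
         //   })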
   330  func (rs *storeReplicaVisitor) Visit(visitor func(*Replica) bool) {
    331  	// Copy the replicas to a slice so that we iterate over some (possibly
    332  	// stale) view of all Replicas without holding the Store lock. In particular,
    333  	// no locks are acquired during the copy process.
   334  	rs.repls = nil
   335  	rs.store.mu.replicas.Range(func(k int64, v unsafe.Pointer) bool {
   336  		rs.repls = append(rs.repls, (*Replica)(v))
   337  		return true
   338  	})
   339  
   340  	if rs.ordered {
   341  		// If the replicas were requested in sorted order, perform the sort.
   342  		sort.Sort(rs)
   343  	} else {
   344  		// The Replicas are already in "unspecified order" due to map iteration,
   345  		// but we want to make sure it's completely random to prevent issues in
   346  		// tests where stores are scanning replicas in lock-step and one store is
   347  		// winning the race and getting a first crack at processing the replicas on
   348  		// its queues.
   349  		//
   350  		// TODO(peter): Re-evaluate whether this is necessary after we allow
   351  		// rebalancing away from the leaseholder. See TestRebalance_3To5Small.
   352  		shuffle.Shuffle(rs)
   353  	}
   354  
   355  	rs.visited = 0
   356  	for _, repl := range rs.repls {
   357  		// TODO(tschottdorf): let the visitor figure out if something's been
   358  		// destroyed once we return errors from mutexes (#9190). After all, it
   359  		// can still happen with this code.
   360  		rs.visited++
   361  		repl.mu.RLock()
   362  		destroyed := repl.mu.destroyStatus
   363  		initialized := repl.isInitializedRLocked()
   364  		repl.mu.RUnlock()
   365  		if initialized && destroyed.IsAlive() && !visitor(repl) {
   366  			break
   367  		}
   368  	}
   369  	rs.visited = 0
   370  }
   371  
   372  // EstimatedCount returns an estimated count of the underlying store's
   373  // replicas.
   374  //
   375  // TODO(tschottdorf): this method has highly doubtful semantics.
   376  func (rs *storeReplicaVisitor) EstimatedCount() int {
   377  	if rs.visited <= 0 {
   378  		return rs.store.ReplicaCount()
   379  	}
   380  	return len(rs.repls) - rs.visited
   381  }
   382  
   383  // A Store maintains a map of ranges by start key. A Store corresponds
   384  // to one physical device.
   385  type Store struct {
   386  	Ident              *roachpb.StoreIdent // pointer to catch access before Start() is called
   387  	cfg                StoreConfig
   388  	db                 *kv.DB
   389  	engine             storage.Engine       // The underlying key-value store
   390  	compactor          *compactor.Compactor // Schedules compaction of the engine
   391  	tsCache            tscache.Cache        // Most recent timestamps for keys / key ranges
   392  	allocator          Allocator            // Makes allocation decisions
   393  	replRankings       *replicaRankings
   394  	storeRebalancer    *StoreRebalancer
   395  	rangeIDAlloc       *idalloc.Allocator          // Range ID allocator
   396  	gcQueue            *gcQueue                    // Garbage collection queue
   397  	mergeQueue         *mergeQueue                 // Range merging queue
   398  	splitQueue         *splitQueue                 // Range splitting queue
   399  	replicateQueue     *replicateQueue             // Replication queue
   400  	replicaGCQueue     *replicaGCQueue             // Replica GC queue
   401  	raftLogQueue       *raftLogQueue               // Raft log truncation queue
   402  	raftSnapshotQueue  *raftSnapshotQueue          // Raft repair queue
   403  	tsMaintenanceQueue *timeSeriesMaintenanceQueue // Time series maintenance queue
   404  	scanner            *replicaScanner             // Replica scanner
   405  	consistencyQueue   *consistencyQueue           // Replica consistency check queue
   406  	metrics            *StoreMetrics
   407  	intentResolver     *intentresolver.IntentResolver
   408  	recoveryMgr        txnrecovery.Manager
   409  	raftEntryCache     *raftentry.Cache
   410  	limiters           batcheval.Limiters
   411  	txnWaitMetrics     *txnwait.Metrics
   412  	sstSnapshotStorage SSTSnapshotStorage
   413  	protectedtsCache   protectedts.Cache
   414  
    415  	// gossipRangeCountdown and gossipLeaseCountdown are countdowns of
   416  	// changes to range and leaseholder counts, after which the store
   417  	// descriptor will be re-gossiped earlier than the normal periodic
   418  	// gossip interval. Updated atomically.
   419  	gossipRangeCountdown int32
   420  	gossipLeaseCountdown int32
    421  	// gossipQueriesPerSecondVal and gossipWritesPerSecondVal serve similar
   422  	// purposes, but simply record the most recently gossiped value so that we
   423  	// can tell if a newly measured value differs by enough to justify
   424  	// re-gossiping the store.
   425  	gossipQueriesPerSecondVal syncutil.AtomicFloat64
   426  	gossipWritesPerSecondVal  syncutil.AtomicFloat64
   427  
   428  	coalescedMu struct {
   429  		syncutil.Mutex
   430  		heartbeats         map[roachpb.StoreIdent][]RaftHeartbeat
   431  		heartbeatResponses map[roachpb.StoreIdent][]RaftHeartbeat
   432  	}
   433  	// 1 if the store was started, 0 if it wasn't. To be accessed using atomic
   434  	// ops.
   435  	started int32
   436  	stopper *stop.Stopper
   437  	// The time when the store was Start()ed, in nanos.
   438  	startedAt    int64
   439  	nodeDesc     *roachpb.NodeDescriptor
   440  	initComplete sync.WaitGroup // Signaled by async init tasks
   441  
   442  	// Semaphore to limit concurrent non-empty snapshot application.
   443  	snapshotApplySem chan struct{}
   444  
   445  	// Track newly-acquired expiration-based leases that we want to proactively
   446  	// renew. An object is sent on the signal whenever a new entry is added to
   447  	// the map.
   448  	renewableLeases       syncutil.IntMap // map[roachpb.RangeID]*Replica
   449  	renewableLeasesSignal chan struct{}
   450  
   451  	// draining holds a bool which indicates whether this store is draining. See
   452  	// SetDraining() for a more detailed explanation of behavior changes.
   453  	//
   454  	// TODO(bdarnell,tschottdorf): Would look better inside of `mu`, which at
   455  	// the time of its creation was riddled with deadlock (but that situation
   456  	// has likely improved).
   457  	draining atomic.Value
   458  
   459  	// Locking notes: To avoid deadlocks, the following lock order must be
   460  	// obeyed: baseQueue.mu < Replica.raftMu < Replica.readOnlyCmdMu < Store.mu
   461  	// < Replica.mu < Replica.unreachablesMu < Store.coalescedMu < Store.scheduler.mu.
   462  	// (It is not required to acquire every lock in sequence, but when multiple
   463  	// locks are held at the same time, it is incorrect to acquire a lock with
   464  	// "lesser" value in this sequence after one with "greater" value).
   465  	//
   466  	// Methods of Store with a "Locked" suffix require that
   467  	// Store.mu.Mutex be held. Other locking requirements are indicated
   468  	// in comments.
   469  	//
   470  	// The locking structure here is complex because A) Store is a
   471  	// container of Replicas, so it must generally be consulted before
   472  	// doing anything with any Replica, B) some Replica operations
   473  	// (including splits) modify the Store. Therefore we generally lock
   474  	// Store.mu to find a Replica, release it, then call a method on the
   475  	// Replica. These short-lived locks of Store.mu and Replica.mu are
   476  	// often surrounded by a long-lived lock of Replica.raftMu as
   477  	// described below.
   478  	//
   479  	// There are two major entry points to this stack of locks:
   480  	// Store.Send (which handles incoming RPCs) and raft-related message
   481  	// processing (including handleRaftReady on the processRaft
   482  	// goroutine and HandleRaftRequest on GRPC goroutines). Reads are
   483  	// processed solely through Store.Send; writes start out on
   484  	// Store.Send until they propose their raft command and then they
   485  	// finish on the raft goroutines.
   486  	//
   487  	// TODO(bdarnell): a Replica could be destroyed immediately after
   488  	// Store.Send finds the Replica and releases the lock. We need
   489  	// another RWMutex to be held by anything using a Replica to ensure
   490  	// that everything is finished before releasing it. #7169
   491  	//
   492  	// Detailed description of the locks:
   493  	//
   494  	// * Replica.raftMu: Held while any raft messages are being processed
   495  	//   (including handleRaftReady and HandleRaftRequest) or while the set of
   496  	//   Replicas in the Store is being changed (which may happen outside of raft
   497  	//   via the replica GC queue).
   498  	//
   499  	//   If holding raftMus for multiple different replicas simultaneously,
   500  	//   acquire the locks in the order that the replicas appear in replicasByKey.
   501  	//
   502  	// * Replica.readOnlyCmdMu (RWMutex): Held in read mode while any
   503  	//   read-only command is in progress on the replica; held in write
   504  	//   mode while executing a commit trigger. This is necessary
   505  	//   because read-only commands mutate the Replica's timestamp cache
   506  	//   (while holding Replica.mu in addition to readOnlyCmdMu). The
   507  	//   RWMutex ensures that no reads are being executed during a split
   508  	//   (which copies the timestamp cache) while still allowing
   509  	//   multiple reads in parallel (#3148). TODO(bdarnell): this lock
   510  	//   only needs to be held during splitTrigger, not all triggers.
   511  	//
   512  	// * baseQueue.mu: The mutex contained in each of the store's queues (such
   513  	//   as the replicate queue, replica GC queue, GC queue, ...). The mutex is
   514  	//   typically acquired when deciding whether to add a replica to the respective
   515  	//   queue.
   516  	//
   517  	// * Store.mu: Protects the Store's map of its Replicas. Acquired and
   518  	//   released briefly at the start of each request; metadata operations like
   519  	//   splits acquire it again to update the map. Even though these lock
   520  	//   acquisitions do not make up a single critical section, it is safe thanks
   521  	//   to Replica.raftMu which prevents any concurrent modifications.
   522  	//
   523  	// * Replica.mu: Protects the Replica's in-memory state. Acquired
   524  	//   and released briefly as needed (note that while the lock is
   525  	//   held "briefly" in that it is not held for an entire request, we
   526  	//   do sometimes do I/O while holding the lock, as in
   527  	//   Replica.Entries). This lock should be held when calling any
   528  	//   methods on the raft group. Raft may call back into the Replica
   529  	//   via the methods of the raft.Storage interface, which assume the
   530  	//   lock is held even though they do not follow our convention of
   531  	//   the "Locked" suffix.
   532  	//
   533  	// * Store.scheduler.mu: Protects the Raft scheduler internal
   534  	//   state. Callbacks from the scheduler are performed while not holding this
   535  	//   mutex in order to observe the above ordering constraints.
   536  	//
   537  	// Splits and merges deserve special consideration: they operate on two
   538  	// ranges. For splits, this might seem fine because the right-hand range is
   539  	// brand new, but an uninitialized version may have been created by a raft
   540  	// message before we process the split (see commentary on
   541  	// Replica.splitTrigger). We make this safe, for both splits and merges, by
   542  	// locking the right-hand range for the duration of the Raft command
   543  	// containing the split/merge trigger.
   544  	//
   545  	// Note that because we acquire and release Store.mu and Replica.mu
   546  	// repeatedly rather than holding a lock for an entire request, we are
   547  	// actually relying on higher-level locks to ensure that things don't change
   548  	// out from under us. In particular, handleRaftReady accesses the replicaID
   549  	// more than once, and we rely on Replica.raftMu to ensure that this is not
   550  	// modified by a concurrent HandleRaftRequest. (#4476)
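         	//
         	// For example (an illustrative sketch of the ordering above), code
         	// that needs both Store.mu and Replica.mu acquires Store.mu first,
         	// releases it, and only then locks the Replica; acquiring Store.mu
         	// while holding Replica.mu would violate the order:
         	//
         	//   s.mu.RLock()
         	//   repl := ... // look up the replica in s.mu.replicas
         	//   s.mu.RUnlock()
         	//   repl.mu.RLock()
         	//   defer repl.mu.RUnlock()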
   551  
   552  	mu struct {
   553  		syncutil.RWMutex
   554  		// Map of replicas by Range ID (map[roachpb.RangeID]*Replica). This
   555  		// includes `uninitReplicas`. May be read without holding Store.mu.
   556  		replicas syncutil.IntMap
    557  		// A btree containing objects of type *Replica or *ReplicaPlaceholder.
   558  		// Both types have an associated key range; the btree is keyed on their
   559  		// start keys.
   560  		replicasByKey  *btree.BTree
   561  		uninitReplicas map[roachpb.RangeID]*Replica // Map of uninitialized replicas by Range ID
   562  		// replicaPlaceholders is a map to access all placeholders, so they can
   563  		// be directly accessed and cleared after stepping all raft groups. This
   564  		// is always in sync with the placeholders in replicasByKey.
   565  		replicaPlaceholders map[roachpb.RangeID]*ReplicaPlaceholder
   566  	}
   567  
   568  	// The unquiesced subset of replicas.
   569  	unquiescedReplicas struct {
   570  		syncutil.Mutex
   571  		m map[roachpb.RangeID]struct{}
   572  	}
   573  
   574  	// The subset of replicas with active rangefeeds.
   575  	rangefeedReplicas struct {
   576  		syncutil.Mutex
   577  		m map[roachpb.RangeID]struct{}
   578  	}
   579  
   580  	// replicaQueues is a map of per-Replica incoming request queues. These
   581  	// queues might more naturally belong in Replica, but are kept separate to
   582  	// avoid reworking the locking in getOrCreateReplica which requires
   583  	// Replica.raftMu to be held while a replica is being inserted into
   584  	// Store.mu.replicas.
   585  	replicaQueues syncutil.IntMap // map[roachpb.RangeID]*raftRequestQueue
   586  
   587  	scheduler *raftScheduler
   588  
   589  	// livenessMap is a map from nodeID to a bool indicating
   590  	// liveness. It is updated periodically in raftTickLoop().
   591  	livenessMap atomic.Value
   592  
   593  	// cachedCapacity caches information on store capacity to prevent
   594  	// expensive recomputations in case leases or replicas are rapidly
   595  	// rebalancing.
   596  	cachedCapacity struct {
   597  		syncutil.Mutex
   598  		roachpb.StoreCapacity
   599  	}
   600  
   601  	counts struct {
   602  		// Number of placeholders removed due to error.
   603  		removedPlaceholders int32
   604  		// Number of placeholders successfully filled by a snapshot.
   605  		filledPlaceholders int32
   606  		// Number of placeholders removed due to a snapshot that was dropped by
   607  		// raft.
   608  		droppedPlaceholders int32
   609  	}
   610  
   611  	computeInitialMetrics sync.Once
   612  }
   613  
   614  var _ kv.Sender = &Store{}
   615  
   616  // A StoreConfig encompasses the auxiliary objects and configuration
   617  // required to create a store.
   618  // All fields holding a pointer or an interface are required to create
   619  // a store; the rest will have sane defaults set if omitted.
   620  type StoreConfig struct {
   621  	AmbientCtx log.AmbientContext
   622  	base.RaftConfig
   623  
   624  	DefaultZoneConfig       *zonepb.ZoneConfig
   625  	DefaultSystemZoneConfig *zonepb.ZoneConfig
   626  	Settings                *cluster.Settings
   627  	Clock                   *hlc.Clock
   628  	DB                      *kv.DB
   629  	Gossip                  *gossip.Gossip
   630  	NodeLiveness            *NodeLiveness
   631  	StorePool               *StorePool
   632  	Transport               *RaftTransport
   633  	NodeDialer              *nodedialer.Dialer
   634  	RPCContext              *rpc.Context
   635  	RangeDescriptorCache    kvbase.RangeDescriptorCache
   636  
   637  	ClosedTimestamp *container.Container
   638  
   639  	// SQLExecutor is used by the store to execute SQL statements.
   640  	SQLExecutor sqlutil.InternalExecutor
   641  
   642  	// TimeSeriesDataStore is an interface used by the store's time series
   643  	// maintenance queue to dispatch individual maintenance tasks.
   644  	TimeSeriesDataStore TimeSeriesDataStore
   645  
   646  	// CoalescedHeartbeatsInterval is the interval for which heartbeat messages
   647  	// are queued and then sent as a single coalesced heartbeat; it is a
   648  	// fraction of the RaftTickInterval so that heartbeats don't get delayed by
   649  	// an entire tick. Delaying coalescing heartbeat responses has a bad
   650  	// interaction with quiescence because the coalesced (delayed) heartbeat
   651  	// response can unquiesce the leader. Consider:
   652  	//
   653  	// T+0: leader queues MsgHeartbeat
   654  	// T+1: leader sends MsgHeartbeat
   655  	//                                        follower receives MsgHeartbeat
   656  	//                                        follower queues MsgHeartbeatResp
   657  	// T+2: leader queues quiesce message
   658  	//                                        follower sends MsgHeartbeatResp
   659  	//      leader receives MsgHeartbeatResp
   660  	// T+3: leader sends quiesce message
   661  	//
   662  	// Thus we want to make sure that heartbeats are responded to faster than
   663  	// the quiesce cadence.
   664  	CoalescedHeartbeatsInterval time.Duration
   665  
   666  	// RaftHeartbeatIntervalTicks is the number of ticks that pass between heartbeats.
   667  	RaftHeartbeatIntervalTicks int
   668  
   669  	// ScanInterval is the default value for the scan interval
   670  	ScanInterval time.Duration
   671  
   672  	// ScanMinIdleTime is the minimum time the scanner will be idle between ranges.
   673  	// If enabled (> 0), the scanner may complete in more than ScanInterval for
   674  	// stores with many ranges.
   675  	ScanMinIdleTime time.Duration
   676  
   677  	// ScanMaxIdleTime is the maximum time the scanner will be idle between ranges.
   678  	// If enabled (> 0), the scanner may complete in less than ScanInterval for small
   679  	// stores.
   680  	ScanMaxIdleTime time.Duration
   681  
   682  	// If LogRangeEvents is true, major changes to ranges will be logged into
   683  	// the range event log.
   684  	LogRangeEvents bool
   685  
   686  	// RaftEntryCacheSize is the size in bytes of the Raft log entry cache
   687  	// shared by all Raft groups managed by the store.
   688  	RaftEntryCacheSize uint64
   689  
   690  	// IntentResolverTaskLimit is the maximum number of asynchronous tasks that
   691  	// may be started by the intent resolver. -1 indicates no asynchronous tasks
   692  	// are allowed. 0 uses the default value (defaultIntentResolverTaskLimit)
   693  	// which is non-zero.
   694  	IntentResolverTaskLimit int
   695  
   696  	TestingKnobs StoreTestingKnobs
   697  
   698  	// concurrentSnapshotApplyLimit specifies the maximum number of empty
   699  	// snapshots and the maximum number of non-empty snapshots that are permitted
   700  	// to be applied concurrently.
   701  	concurrentSnapshotApplyLimit int
   702  
   703  	// HistogramWindowInterval is (server.Config).HistogramWindowInterval
   704  	HistogramWindowInterval time.Duration
   705  
   706  	// EnableEpochRangeLeases controls whether epoch-based range leases are used.
   707  	EnableEpochRangeLeases bool
   708  
    709  	// GossipWhenCapacityDeltaExceedsFraction specifies the fraction by which
    710  	// the last gossiped store capacity values must change before the store
    711  	// gossips immediately, without waiting for the periodic gossip interval.
   712  	GossipWhenCapacityDeltaExceedsFraction float64
   713  
    714  	// ExternalStorage creates ExternalStorage objects, which allow access to external files.
   715  	ExternalStorage        cloud.ExternalStorageFactory
   716  	ExternalStorageFromURI cloud.ExternalStorageFromURIFactory
   717  
   718  	// ProtectedTimestampCache maintains the state of the protected timestamp
   719  	// subsystem. It is queried during the GC process and in the handling of
   720  	// AdminVerifyProtectedTimestampRequest.
   721  	ProtectedTimestampCache protectedts.Cache
   722  }
   723  
   724  // ConsistencyTestingKnobs is a BatchEvalTestingKnobs struct used to control the
   725  // behavior of the consistency checker for tests.
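         //
         // A hedged usage sketch (the hook body is a placeholder):
         //
         //   knobs := ConsistencyTestingKnobs{
         //       ConsistencyQueueResultHook: func(resp roachpb.CheckConsistencyResponse) {
         //           // inspect resp in a test
         //       },
         //   }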
   726  type ConsistencyTestingKnobs struct {
   727  	// If non-nil, OnBadChecksumFatal is called by CheckConsistency() (instead of
   728  	// calling log.Fatal) on a checksum mismatch.
   729  	OnBadChecksumFatal func(roachpb.StoreIdent)
   730  	// If non-nil, BadChecksumReportDiff is called by CheckConsistency() on a
   731  	// checksum mismatch to report the diff between snapshots.
   732  	BadChecksumReportDiff      func(roachpb.StoreIdent, ReplicaSnapshotDiffSlice)
   733  	ConsistencyQueueResultHook func(response roachpb.CheckConsistencyResponse)
   734  }
   735  
   736  // Valid returns true if the StoreConfig is populated correctly.
   737  // We don't check for Gossip and DB since some of our tests pass
    738  // them as nil.
   739  func (sc *StoreConfig) Valid() bool {
   740  	return sc.Clock != nil && sc.Transport != nil &&
   741  		sc.RaftTickInterval != 0 && sc.RaftHeartbeatIntervalTicks > 0 &&
   742  		sc.RaftElectionTimeoutTicks > 0 && sc.ScanInterval >= 0 &&
   743  		sc.AmbientCtx.Tracer != nil
   744  }
   745  
   746  // SetDefaults initializes unset fields in StoreConfig to values
   747  // suitable for use on a local network.
   748  // TODO(tschottdorf): see if this ought to be configurable via flags.
   749  func (sc *StoreConfig) SetDefaults() {
   750  	sc.RaftConfig.SetDefaults()
   751  
   752  	if sc.CoalescedHeartbeatsInterval == 0 {
   753  		sc.CoalescedHeartbeatsInterval = sc.RaftTickInterval / 2
   754  	}
   755  	if sc.RaftHeartbeatIntervalTicks == 0 {
   756  		sc.RaftHeartbeatIntervalTicks = defaultRaftHeartbeatIntervalTicks
   757  	}
   758  	if sc.RaftEntryCacheSize == 0 {
   759  		sc.RaftEntryCacheSize = defaultRaftEntryCacheSize
   760  	}
   761  	if sc.concurrentSnapshotApplyLimit == 0 {
   762  		// NB: setting this value higher than 1 is likely to degrade client
   763  		// throughput.
   764  		sc.concurrentSnapshotApplyLimit =
   765  			envutil.EnvOrDefaultInt("COCKROACH_CONCURRENT_SNAPSHOT_APPLY_LIMIT", 1)
   766  	}
   767  
   768  	if sc.GossipWhenCapacityDeltaExceedsFraction == 0 {
   769  		sc.GossipWhenCapacityDeltaExceedsFraction = defaultGossipWhenCapacityDeltaExceedsFraction
   770  	}
   771  }
   772  
    773  // LeaseExpiration returns an int64 by which to increment a manual clock to
    774  // make sure that all active range leases expire.
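         //
         // For example (illustrative): with a 9s range lease active duration and a
         // 500ms maximum clock offset, this returns 2*(9s+500ms) = 19s in
         // nanoseconds, which a test can feed to a manual clock:
         //
         //   manual.Increment(cfg.LeaseExpiration())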
   775  func (sc *StoreConfig) LeaseExpiration() int64 {
    776  	// Due to lease extensions, the remaining interval can be longer than just
    777  	// the sum of the offset (=length of stasis period) and the active
    778  	// duration, but definitely not by more than 2x.
   779  	maxOffset := sc.Clock.MaxOffset()
   780  	return 2 * (sc.RangeLeaseActiveDuration() + maxOffset).Nanoseconds()
   781  }
   782  
   783  // NewStore returns a new instance of a store.
   784  func NewStore(
   785  	ctx context.Context, cfg StoreConfig, eng storage.Engine, nodeDesc *roachpb.NodeDescriptor,
   786  ) *Store {
   787  	// TODO(tschottdorf): find better place to set these defaults.
   788  	cfg.SetDefaults()
   789  
   790  	if !cfg.Valid() {
   791  		log.Fatalf(ctx, "invalid store configuration: %+v", &cfg)
   792  	}
   793  	s := &Store{
   794  		cfg:      cfg,
   795  		db:       cfg.DB, // TODO(tschottdorf): remove redundancy.
   796  		engine:   eng,
   797  		nodeDesc: nodeDesc,
   798  		metrics:  newStoreMetrics(cfg.HistogramWindowInterval),
   799  	}
   800  	if cfg.RPCContext != nil {
   801  		s.allocator = MakeAllocator(cfg.StorePool, cfg.RPCContext.RemoteClocks.Latency)
   802  	} else {
   803  		s.allocator = MakeAllocator(cfg.StorePool, func(string) (time.Duration, bool) {
   804  			return 0, false
   805  		})
   806  	}
   807  	s.replRankings = newReplicaRankings()
   808  
   809  	s.draining.Store(false)
   810  	s.scheduler = newRaftScheduler(s.metrics, s, storeSchedulerConcurrency)
   811  
   812  	s.raftEntryCache = raftentry.NewCache(cfg.RaftEntryCacheSize)
   813  	s.metrics.registry.AddMetricStruct(s.raftEntryCache.Metrics())
   814  
   815  	s.coalescedMu.Lock()
   816  	s.coalescedMu.heartbeats = map[roachpb.StoreIdent][]RaftHeartbeat{}
   817  	s.coalescedMu.heartbeatResponses = map[roachpb.StoreIdent][]RaftHeartbeat{}
   818  	s.coalescedMu.Unlock()
   819  
   820  	s.mu.Lock()
   821  	s.mu.replicaPlaceholders = map[roachpb.RangeID]*ReplicaPlaceholder{}
   822  	s.mu.replicasByKey = btree.New(64 /* degree */)
   823  	s.mu.uninitReplicas = map[roachpb.RangeID]*Replica{}
   824  	s.mu.Unlock()
   825  
   826  	s.unquiescedReplicas.Lock()
   827  	s.unquiescedReplicas.m = map[roachpb.RangeID]struct{}{}
   828  	s.unquiescedReplicas.Unlock()
   829  
   830  	s.rangefeedReplicas.Lock()
   831  	s.rangefeedReplicas.m = map[roachpb.RangeID]struct{}{}
   832  	s.rangefeedReplicas.Unlock()
   833  
   834  	s.tsCache = tscache.New(cfg.Clock)
   835  	s.metrics.registry.AddMetricStruct(s.tsCache.Metrics())
   836  
   837  	s.txnWaitMetrics = txnwait.NewMetrics(cfg.HistogramWindowInterval)
   838  	s.metrics.registry.AddMetricStruct(s.txnWaitMetrics)
   839  
   840  	s.compactor = compactor.NewCompactor(
   841  		s.cfg.Settings,
   842  		s.engine,
   843  		func() (roachpb.StoreCapacity, error) {
   844  			return s.Capacity(false /* useCached */)
   845  		},
   846  		func(ctx context.Context) {
   847  			s.asyncGossipStore(ctx, "compactor-initiated rocksdb compaction", false /* useCached */)
   848  		},
   849  	)
   850  	s.metrics.registry.AddMetricStruct(s.compactor.Metrics)
   851  
   852  	s.snapshotApplySem = make(chan struct{}, cfg.concurrentSnapshotApplyLimit)
   853  
   854  	s.renewableLeasesSignal = make(chan struct{})
   855  
   856  	s.limiters.BulkIOWriteRate = rate.NewLimiter(rate.Limit(bulkIOWriteLimit.Get(&cfg.Settings.SV)), bulkIOWriteBurst)
   857  	bulkIOWriteLimit.SetOnChange(&cfg.Settings.SV, func() {
   858  		s.limiters.BulkIOWriteRate.SetLimit(rate.Limit(bulkIOWriteLimit.Get(&cfg.Settings.SV)))
   859  	})
   860  	s.limiters.ConcurrentImportRequests = limit.MakeConcurrentRequestLimiter(
   861  		"importRequestLimiter", int(importRequestsLimit.Get(&cfg.Settings.SV)),
   862  	)
   863  	importRequestsLimit.SetOnChange(&cfg.Settings.SV, func() {
   864  		s.limiters.ConcurrentImportRequests.SetLimit(int(importRequestsLimit.Get(&cfg.Settings.SV)))
   865  	})
   866  	s.limiters.ConcurrentExportRequests = limit.MakeConcurrentRequestLimiter(
   867  		"exportRequestLimiter", int(ExportRequestsLimit.Get(&cfg.Settings.SV)),
   868  	)
   869  
   870  	// The snapshot storage is usually empty at this point since it is cleared
   871  	// after each snapshot application, except when the node crashed right before
    872  	// it could clean it up. If clearing fails, it's not a correctness issue since the
   873  	// storage is also cleared before receiving a snapshot.
   874  	s.sstSnapshotStorage = NewSSTSnapshotStorage(s.engine, s.limiters.BulkIOWriteRate)
   875  	if err := s.sstSnapshotStorage.Clear(); err != nil {
   876  		log.Warningf(ctx, "failed to clear snapshot storage: %v", err)
   877  	}
   878  	s.protectedtsCache = cfg.ProtectedTimestampCache
   879  
    880  	// On low-CPU instances, the default limit value may still allow ExportRequests
    881  	// to tie up all cores, so cap the limiter at cores-1 when the setting's value is higher.
   882  	exportCores := runtime.NumCPU() - 1
   883  	if exportCores < 1 {
   884  		exportCores = 1
   885  	}
   886  	ExportRequestsLimit.SetOnChange(&cfg.Settings.SV, func() {
   887  		limit := int(ExportRequestsLimit.Get(&cfg.Settings.SV))
   888  		if limit > exportCores {
   889  			limit = exportCores
   890  		}
   891  		s.limiters.ConcurrentExportRequests.SetLimit(limit)
   892  	})
   893  	s.limiters.ConcurrentAddSSTableRequests = limit.MakeConcurrentRequestLimiter(
   894  		"addSSTableRequestLimiter", int(addSSTableRequestLimit.Get(&cfg.Settings.SV)),
   895  	)
   896  	addSSTableRequestLimit.SetOnChange(&cfg.Settings.SV, func() {
   897  		s.limiters.ConcurrentAddSSTableRequests.SetLimit(int(addSSTableRequestLimit.Get(&cfg.Settings.SV)))
   898  	})
   899  	s.limiters.ConcurrentRangefeedIters = limit.MakeConcurrentRequestLimiter(
   900  		"rangefeedIterLimiter", int(concurrentRangefeedItersLimit.Get(&cfg.Settings.SV)),
   901  	)
   902  	concurrentRangefeedItersLimit.SetOnChange(&cfg.Settings.SV, func() {
   903  		s.limiters.ConcurrentRangefeedIters.SetLimit(
   904  			int(concurrentRangefeedItersLimit.Get(&cfg.Settings.SV)))
   905  	})
   906  
   907  	if s.cfg.Gossip != nil {
   908  		// Add range scanner and configure with queues.
   909  		s.scanner = newReplicaScanner(
   910  			s.cfg.AmbientCtx, s.cfg.Clock, cfg.ScanInterval,
   911  			cfg.ScanMinIdleTime, cfg.ScanMaxIdleTime, newStoreReplicaVisitor(s),
   912  		)
   913  		s.gcQueue = newGCQueue(s, s.cfg.Gossip)
   914  		s.mergeQueue = newMergeQueue(s, s.db, s.cfg.Gossip)
   915  		s.splitQueue = newSplitQueue(s, s.db, s.cfg.Gossip)
   916  		s.replicateQueue = newReplicateQueue(s, s.cfg.Gossip, s.allocator)
   917  		s.replicaGCQueue = newReplicaGCQueue(s, s.db, s.cfg.Gossip)
   918  		s.raftLogQueue = newRaftLogQueue(s, s.db, s.cfg.Gossip)
   919  		s.raftSnapshotQueue = newRaftSnapshotQueue(s, s.cfg.Gossip)
   920  		s.consistencyQueue = newConsistencyQueue(s, s.cfg.Gossip)
   921  		// NOTE: If more queue types are added, please also add them to the list of
   922  		// queues on the EnqueueRange debug page as defined in
   923  		// pkg/ui/src/views/reports/containers/enqueueRange/index.tsx
   924  		s.scanner.AddQueues(
   925  			s.gcQueue, s.mergeQueue, s.splitQueue, s.replicateQueue, s.replicaGCQueue,
   926  			s.raftLogQueue, s.raftSnapshotQueue, s.consistencyQueue)
   927  
   928  		if s.cfg.TimeSeriesDataStore != nil {
   929  			s.tsMaintenanceQueue = newTimeSeriesMaintenanceQueue(
   930  				s, s.db, s.cfg.Gossip, s.cfg.TimeSeriesDataStore,
   931  			)
   932  			s.scanner.AddQueues(s.tsMaintenanceQueue)
   933  		}
   934  	}
   935  
   936  	if cfg.TestingKnobs.DisableGCQueue {
   937  		s.setGCQueueActive(false)
   938  	}
   939  	if cfg.TestingKnobs.DisableMergeQueue {
   940  		s.setMergeQueueActive(false)
   941  	}
   942  	if cfg.TestingKnobs.DisableRaftLogQueue {
   943  		s.setRaftLogQueueActive(false)
   944  	}
   945  	if cfg.TestingKnobs.DisableReplicaGCQueue {
   946  		s.setReplicaGCQueueActive(false)
   947  	}
   948  	if cfg.TestingKnobs.DisableReplicateQueue {
   949  		s.SetReplicateQueueActive(false)
   950  	}
   951  	if cfg.TestingKnobs.DisableSplitQueue {
   952  		s.setSplitQueueActive(false)
   953  	}
   954  	if cfg.TestingKnobs.DisableTimeSeriesMaintenanceQueue {
   955  		s.setTimeSeriesMaintenanceQueueActive(false)
   956  	}
   957  	if cfg.TestingKnobs.DisableRaftSnapshotQueue {
   958  		s.setRaftSnapshotQueueActive(false)
   959  	}
   960  	if cfg.TestingKnobs.DisableConsistencyQueue {
   961  		s.setConsistencyQueueActive(false)
   962  	}
   963  	if cfg.TestingKnobs.DisableScanner {
   964  		s.setScannerActive(false)
   965  	}
   966  
   967  	return s
   968  }
   969  
   970  // String formats a store for debug output.
   971  func (s *Store) String() string {
   972  	return fmt.Sprintf("[n%d,s%d]", s.Ident.NodeID, s.Ident.StoreID)
   973  }
   974  
   975  // ClusterSettings returns the node's ClusterSettings.
   976  func (s *Store) ClusterSettings() *cluster.Settings {
   977  	return s.cfg.Settings
   978  }
   979  
   980  // AnnotateCtx is a convenience wrapper; see AmbientContext.
   981  func (s *Store) AnnotateCtx(ctx context.Context) context.Context {
   982  	return s.cfg.AmbientCtx.AnnotateCtx(ctx)
   983  }
   984  
   985  // SetDraining (when called with 'true') causes incoming lease transfers to be
   986  // rejected, prevents all of the Store's Replicas from acquiring or extending
    987  // range leases, and attempts to transfer away any leases it owns.
   988  // When called with 'false', returns to the normal mode of operation.
   989  //
   990  // The reporter callback, if non-nil, is called on a best effort basis
   991  // to report work that needed to be done and which may or may not have
   992  // been done by the time this call returns. See the explanation in
   993  // pkg/server/drain.go for details.
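         //
         // A hedged usage sketch (the reporter signature matches this method; ctx
         // and the log call are placeholders):
         //
         //   s.SetDraining(true, func(remaining int, what string) {
         //       log.Infof(ctx, "%d %s remaining", remaining, what)
         //   })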
   994  func (s *Store) SetDraining(drain bool, reporter func(int, string)) {
   995  	s.draining.Store(drain)
   996  	if !drain {
   997  		newStoreReplicaVisitor(s).Visit(func(r *Replica) bool {
   998  			r.mu.Lock()
   999  			r.mu.draining = false
  1000  			r.mu.Unlock()
  1001  			return true
  1002  		})
  1003  		return
  1004  	}
  1005  
  1006  	baseCtx := logtags.AddTag(context.Background(), "drain", nil)
  1007  
  1008  	// In a running server, the code below (transferAllAway and the loop
  1009  	// that calls it) does not need to be conditional on messaging by
   1010  	// the Stopper. This is because the top-level Server calls SetDraining
   1011  	// upon a graceful shutdown, and waits until the SetDraining call
   1012  	// completes, at which point the work has terminated on its own. If
  1013  	// the top-level server is forcefully shut down, it does not matter
  1014  	// if some of the code below is still running.
  1015  	//
  1016  	// However, the situation is different in unit tests where we also
  1017  	// assert there are no leaking goroutines when a test terminates.
  1018  	// If a test terminates with a timed out lease transfer, it's
  1019  	// possible for the transferAllAway() closure to be still running
  1020  	// when the closer shuts down the test server.
  1021  	//
  1022  	// To prevent this, we add this code here which adds the missing
  1023  	// cancel + wait in the particular case where the stopper is
  1024  	// completing a shutdown while a graceful SetDrain is still ongoing.
  1025  	ctx, cancelFn := s.stopper.WithCancelOnStop(baseCtx)
  1026  	defer cancelFn()
  1027  
  1028  	var wg sync.WaitGroup
  1029  
  1030  	transferAllAway := func(transferCtx context.Context) int {
  1031  		// Limit the number of concurrent lease transfers.
  1032  		const leaseTransferConcurrency = 100
  1033  		sem := quotapool.NewIntPool("Store.SetDraining", leaseTransferConcurrency)
  1034  
  1035  		// Incremented for every lease or Raft leadership transfer
  1036  		// attempted. We try to send both the lease and the Raft leaders
  1037  		// away, but this may not reliably work. Instead, we run the
  1038  		// surrounding retry loop until there are no leaders/leases left
  1039  		// (ignoring single-replica or uninitialized Raft groups).
  1040  		var numTransfersAttempted int32
  1041  		newStoreReplicaVisitor(s).Visit(func(r *Replica) bool {
  1042  			//
  1043  			// We need to be careful about the case where the ctx has been canceled
  1044  			// prior to the call to (*Stopper).RunLimitedAsyncTask(). In that case,
  1045  			// the goroutine is not even spawned. However, we don't want to
  1046  			// mis-count the missing goroutine as the lack of transfer attempted.
  1047  			// So what we do here is immediately increase numTransfersAttempted
  1048  			// to count this replica, and then decrease it when it is known
  1049  			// below that there is nothing to transfer (not lease holder and
  1050  			// not raft leader).
  1051  			atomic.AddInt32(&numTransfersAttempted, 1)
  1052  			wg.Add(1)
  1053  			if err := s.stopper.RunLimitedAsyncTask(
  1054  				r.AnnotateCtx(ctx), "storage.Store: draining replica", sem, true, /* wait */
  1055  				func(ctx context.Context) {
  1056  					defer wg.Done()
  1057  
  1058  					select {
  1059  					case <-transferCtx.Done():
  1060  						// Context canceled: the timeout loop has decided we've
  1061  						// done enough draining
  1062  						// (server.shutdown.lease_transfer_wait).
  1063  						//
  1064  						// We need this check here because each call of
  1065  						// transferAllAway() traverses all stores/replicas without
  1066  						// checking for the timeout otherwise.
  1067  						if log.V(1) {
  1068  							log.Infof(ctx, "lease transfer aborted due to exceeded timeout")
  1069  						}
  1070  						return
  1071  					default:
  1072  					}
  1073  
  1074  					r.mu.Lock()
  1075  					r.mu.draining = true
  1076  					status := r.raftStatusRLocked()
  1077  					// needsRaftTransfer is true when we can reasonably hope to transfer
  1078  					// this replica's lease and/or Raft leadership away.
  1079  					needsRaftTransfer := status != nil &&
  1080  						len(status.Progress) > 1 &&
  1081  						!(status.RaftState == raft.StateFollower && status.Lead != 0)
  1082  					r.mu.Unlock()
  1083  
  1084  					var drainingLease roachpb.Lease
  1085  					for {
  1086  						var llHandle *leaseRequestHandle
  1087  						r.mu.Lock()
  1088  						lease, nextLease := r.getLeaseRLocked()
  1089  						if nextLease != (roachpb.Lease{}) && nextLease.OwnedBy(s.StoreID()) {
  1090  							llHandle = r.mu.pendingLeaseRequest.JoinRequest()
  1091  						}
  1092  						r.mu.Unlock()
  1093  
  1094  						if llHandle != nil {
  1095  							<-llHandle.C()
  1096  							continue
  1097  						}
  1098  						drainingLease = lease
  1099  						break
  1100  					}
  1101  
  1102  					// Learner replicas aren't allowed to become the leaseholder or raft
  1103  					// leader, so only consider the `Voters` replicas.
  1104  					needsLeaseTransfer := len(r.Desc().Replicas().Voters()) > 1 &&
  1105  						drainingLease.OwnedBy(s.StoreID()) &&
  1106  						r.IsLeaseValid(drainingLease, s.Clock().Now())
  1107  
  1108  					if !needsLeaseTransfer && !needsRaftTransfer {
  1109  						if log.V(1) {
  1110  							// This logging is useful to troubleshoot incomplete drains.
  1111  							log.Info(ctx, "not moving out")
  1112  						}
  1113  						atomic.AddInt32(&numTransfersAttempted, -1)
  1114  						return
  1115  					}
  1116  					if log.V(1) {
  1117  						// This logging is useful to troubleshoot incomplete drains.
  1118  						log.Infof(ctx, "trying to move replica out: lease transfer = %v, raft transfer = %v", needsLeaseTransfer, needsRaftTransfer)
  1119  					}
  1120  
  1121  					if needsLeaseTransfer {
  1122  						desc, zone := r.DescAndZone()
  1123  						leaseTransferred, err := s.replicateQueue.findTargetAndTransferLease(
  1124  							ctx,
  1125  							r,
  1126  							desc,
  1127  							zone,
  1128  							transferLeaseOptions{},
  1129  						)
  1130  						if log.V(1) && !leaseTransferred {
  1131  							// Note that a nil error means that there were no suitable
  1132  							// candidates.
  1133  							log.Errorf(
  1134  								ctx,
  1135  								"did not transfer lease %s for replica %s when draining: %v",
  1136  								drainingLease,
  1137  								desc,
  1138  								err,
  1139  							)
  1140  						}
  1141  						if err == nil && leaseTransferred {
  1142  							// If we just transferred the lease away, Raft leadership will
  1143  							// usually transfer with it. Invoking a separate Raft leadership
  1144  							// transfer would only obstruct this.
  1145  							needsRaftTransfer = false
  1146  						}
  1147  					}
  1148  
  1149  					if needsRaftTransfer {
  1150  						r.raftMu.Lock()
  1151  						r.maybeTransferRaftLeadership(ctx)
  1152  						r.raftMu.Unlock()
  1153  					}
  1154  				}); err != nil {
  1155  				if log.V(1) {
  1156  					log.Errorf(ctx, "error running draining task: %+v", err)
  1157  				}
  1158  				wg.Done()
  1159  				return false
  1160  			}
  1161  			return true
  1162  		})
  1163  		wg.Wait()
  1164  		return int(numTransfersAttempted)
  1165  	}
  1166  
  1167  	// Give all replicas at least one chance to transfer.
  1168  	// If we don't do that, then it's possible that a configured
  1169  	// value for raftLeadershipTransferWait is too low to iterate
  1170  	// through all the replicas at least once, and the drain
  1171  	// condition on the remaining value will never be reached.
  1172  	if numRemaining := transferAllAway(ctx); numRemaining > 0 {
  1173  		// Report progress to the Drain RPC.
  1174  		if reporter != nil {
  1175  			reporter(numRemaining, "range lease iterations")
  1176  		}
  1177  	} else {
  1178  		// No more work to do.
  1179  		return
  1180  	}
  1181  
  1182  	// We've seen all the replicas once. Now we're going to iterate
  1183  	// until they're all gone, up to the configured timeout.
  1184  	transferTimeout := raftLeadershipTransferWait.Get(&s.cfg.Settings.SV)
  1185  
  1186  	if err := contextutil.RunWithTimeout(ctx, "wait for raft leadership transfer", transferTimeout,
  1187  		func(ctx context.Context) error {
  1188  			opts := retry.Options{
  1189  				InitialBackoff: 10 * time.Millisecond,
  1190  				MaxBackoff:     time.Second,
  1191  				Multiplier:     2,
  1192  			}
  1193  			everySecond := log.Every(time.Second)
  1194  			var err error
  1195  			// Avoid retry.ForDuration because of https://github.com/cockroachdb/cockroach/issues/25091.
  1196  			for r := retry.StartWithCtx(ctx, opts); r.Next(); {
  1197  				err = nil
  1198  				if numRemaining := transferAllAway(ctx); numRemaining > 0 {
  1199  					// Report progress to the Drain RPC.
  1200  					if reporter != nil {
  1201  						reporter(numRemaining, "range lease iterations")
  1202  					}
  1203  					err = errors.Errorf("waiting for %d replicas to transfer their lease away", numRemaining)
  1204  					if everySecond.ShouldLog() {
  1205  						log.Infof(ctx, "%v", err)
  1206  					}
  1207  				}
  1208  				if err == nil {
  1209  					// All leases transferred. We can stop retrying.
  1210  					break
  1211  				}
  1212  			}
  1213  			// If there's an error in the context but not yet detected in
  1214  			// err, take it into account here.
  1215  			return errors.CombineErrors(err, ctx.Err())
  1216  		}); err != nil {
  1217  		// You expect this message when shutting down a server in an unhealthy
  1218  		// cluster. If we see it on healthy ones, there's likely something to fix.
  1219  		log.Warningf(ctx, "unable to drain cleanly within %s, service might briefly deteriorate: %+v", transferTimeout, err)
  1220  	}
  1221  }
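
        // The timeout-plus-retry pattern used above can be sketched in isolation as
        // follows (illustrative only; opts, timeout, and work are placeholders, not
        // names from this file):
        //
        //	err := contextutil.RunWithTimeout(ctx, "work", timeout, func(ctx context.Context) error {
        //		var err error
        //		for r := retry.StartWithCtx(ctx, opts); r.Next(); {
        //			if err = work(ctx); err == nil {
        //				break
        //			}
        //		}
        //		// Fold in any context error not yet reflected in err.
        //		return errors.CombineErrors(err, ctx.Err())
        //	})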
  1222  
  1223  // IsStarted returns true if the Store has been started.
  1224  func (s *Store) IsStarted() bool {
  1225  	return atomic.LoadInt32(&s.started) == 1
  1226  }
  1227  
  1228  // IterateIDPrefixKeys helps visit system keys that use RangeID prefixing (such
  1229  // as RaftHardStateKey, RangeTombstoneKey, and many others). Such keys could in
  1230  // principle exist at any RangeID, and this helper efficiently discovers all the
  1231  // keys of the desired type (as specified by the supplied `keyFn`) and, for each
  1232  // key-value pair discovered, unmarshals it into `msg` and then invokes `f`.
  1233  //
  1234  // Iteration stops on the first error (and will pass through that error).
  1235  func IterateIDPrefixKeys(
  1236  	ctx context.Context,
  1237  	reader storage.Reader,
  1238  	keyFn func(roachpb.RangeID) roachpb.Key,
  1239  	msg protoutil.Message,
  1240  	f func(_ roachpb.RangeID) (more bool, _ error),
  1241  ) error {
  1242  	rangeID := roachpb.RangeID(1)
  1243  	iter := reader.NewIterator(storage.IterOptions{
  1244  		UpperBound: keys.LocalRangeIDPrefix.PrefixEnd().AsRawKey(),
  1245  	})
  1246  	defer iter.Close()
  1247  
  1248  	for {
  1249  		bumped := false
  1250  		mvccKey := storage.MakeMVCCMetadataKey(keyFn(rangeID))
  1251  		iter.SeekGE(mvccKey)
  1252  
  1253  		if ok, err := iter.Valid(); !ok {
  1254  			return err
  1255  		}
  1256  
  1257  		unsafeKey := iter.UnsafeKey()
  1258  
  1259  		if !bytes.HasPrefix(unsafeKey.Key, keys.LocalRangeIDPrefix) {
  1260  			// Left the local keyspace, so we're done.
  1261  			return nil
  1262  		}
  1263  
  1264  		curRangeID, _, _, _, err := keys.DecodeRangeIDKey(unsafeKey.Key)
  1265  		if err != nil {
  1266  			return err
  1267  		}
  1268  
  1269  		if curRangeID > rangeID {
  1270  			// `bumped` is always `false` here, but let's be explicit.
  1271  			if !bumped {
  1272  				rangeID = curRangeID
  1273  				bumped = true
  1274  			}
  1275  			mvccKey = storage.MakeMVCCMetadataKey(keyFn(rangeID))
  1276  		}
  1277  
  1278  		if !unsafeKey.Key.Equal(mvccKey.Key) {
  1279  			if !bumped {
  1280  				// Don't increment the rangeID if it has already been incremented
  1281  				// above, or we could skip past a value we ought to see.
  1282  				rangeID++
  1283  				bumped = true // for completeness' sake; continuing below anyway
  1284  			}
  1285  			continue
  1286  		}
  1287  
  1288  		ok, err := storage.MVCCGetProto(
  1289  			ctx, reader, unsafeKey.Key, hlc.Timestamp{}, msg, storage.MVCCGetOptions{})
  1290  		if err != nil {
  1291  			return err
  1292  		}
  1293  		if !ok {
  1294  			return errors.Errorf("unable to unmarshal %s into %T", unsafeKey.Key, msg)
  1295  		}
  1296  
  1297  		more, err := f(rangeID)
  1298  		if !more || err != nil {
  1299  			return err
  1300  		}
  1301  		rangeID++
  1302  	}
  1303  }
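
        // Example (hypothetical caller, shown for illustration only): collecting the
        // IDs of all ranges that have a tombstone. Assumes a ctx and an open
        // storage.Reader; the proto is reused as scratch space across iterations.
        //
        //	var tombstone roachpb.RangeTombstone
        //	var rangeIDs []roachpb.RangeID
        //	err := IterateIDPrefixKeys(ctx, reader, keys.RangeTombstoneKey, &tombstone,
        //		func(rangeID roachpb.RangeID) (bool, error) {
        //			rangeIDs = append(rangeIDs, rangeID)
        //			return true, nil // keep iterating
        //		})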
  1304  
  1305  // IterateRangeDescriptors calls the provided function with each descriptor
  1306  // from the provided Engine. The return values of this method and fn have
  1307  // semantics similar to engine.MVCCIterate.
  1308  func IterateRangeDescriptors(
  1309  	ctx context.Context,
  1310  	reader storage.Reader,
  1311  	fn func(desc roachpb.RangeDescriptor) (done bool, err error),
  1312  ) error {
  1313  	log.Event(ctx, "beginning range descriptor iteration")
  1314  	// Iterate over all range-local key-based data.
  1315  	start := keys.RangeDescriptorKey(roachpb.RKeyMin)
  1316  	end := keys.RangeDescriptorKey(roachpb.RKeyMax)
  1317  
  1318  	allCount := 0
  1319  	matchCount := 0
  1320  	bySuffix := make(map[string]int)
  1321  	kvToDesc := func(kv roachpb.KeyValue) (bool, error) {
  1322  		allCount++
  1323  		// Only consider range metadata entries; ignore others.
  1324  		_, suffix, _, err := keys.DecodeRangeKey(kv.Key)
  1325  		if err != nil {
  1326  			return false, err
  1327  		}
  1328  		bySuffix[string(suffix)]++
  1329  		if !bytes.Equal(suffix, keys.LocalRangeDescriptorSuffix) {
  1330  			return false, nil
  1331  		}
  1332  		var desc roachpb.RangeDescriptor
  1333  		if err := kv.Value.GetProto(&desc); err != nil {
  1334  			return false, err
  1335  		}
  1336  		matchCount++
  1337  		return fn(desc)
  1338  	}
  1339  
  1340  	_, err := storage.MVCCIterate(ctx, reader, start, end, hlc.MaxTimestamp,
  1341  		storage.MVCCScanOptions{Inconsistent: true}, kvToDesc)
  1342  	log.Eventf(ctx, "iterated over %d keys to find %d range descriptors (by suffix: %v)",
  1343  		allCount, matchCount, bySuffix)
  1344  	return err
  1345  }
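
        // For illustration, a sketch of a caller counting the initialized ranges on
        // an engine (hypothetical; assumes a ctx and a storage.Reader named reader).
        // Note that returning false from fn means "not done", i.e. keep iterating.
        //
        //	var n int
        //	err := IterateRangeDescriptors(ctx, reader,
        //		func(desc roachpb.RangeDescriptor) (bool, error) {
        //			if desc.IsInitialized() {
        //				n++
        //			}
        //			return false, nil // not done; visit the next descriptor
        //		})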
  1346  
  1347  // ReadStoreIdent reads the StoreIdent from the store.
  1348  // It returns *NotBootstrappedError if the ident is missing (meaning that the
  1349  // store needs to be bootstrapped).
  1350  func ReadStoreIdent(ctx context.Context, eng storage.Engine) (roachpb.StoreIdent, error) {
  1351  	var ident roachpb.StoreIdent
  1352  	ok, err := storage.MVCCGetProto(
  1353  		ctx, eng, keys.StoreIdentKey(), hlc.Timestamp{}, &ident, storage.MVCCGetOptions{})
  1354  	if err != nil {
  1355  		return roachpb.StoreIdent{}, err
  1356  	} else if !ok {
  1357  		return roachpb.StoreIdent{}, &NotBootstrappedError{}
  1358  	}
  1359  	return ident, err
  1360  }
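
        // For example, a bootstrap check might look like the following sketch
        // (hypothetical; eng is an open storage.Engine):
        //
        //	if _, err := ReadStoreIdent(ctx, eng); errors.HasType(err, (*NotBootstrappedError)(nil)) {
        //		// The engine is not bootstrapped yet; initialize it.
        //	} else if err != nil {
        //		return err // unexpected failure
        //	}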
  1361  
  1362  // Start the engine, set the GC and read the StoreIdent.
  1363  func (s *Store) Start(ctx context.Context, stopper *stop.Stopper) error {
  1364  	s.stopper = stopper
  1365  
  1366  	// Populate the store ident. If not bootstrapped, ReadStoreIdent will
  1367  	// return an error.
  1368  	ident, err := ReadStoreIdent(ctx, s.engine)
  1369  	if err != nil {
  1370  		return err
  1371  	}
  1372  	s.Ident = &ident
  1373  
  1374  	// Set the store ID for logging.
  1375  	s.cfg.AmbientCtx.AddLogTag("s", s.StoreID())
  1376  	ctx = s.AnnotateCtx(ctx)
  1377  	log.Event(ctx, "read store identity")
  1378  
  1379  	// Add the store ID to the scanner's AmbientContext before starting it, since
  1380  	// the AmbientContext provided during construction did not include it.
  1381  	// Note that this is just a hacky way of getting around that without
  1382  	// refactoring the scanner/queue construction/start logic more broadly, and
  1383  	// depends on the scanner not having added its own log tag.
  1384  	if s.scanner != nil {
  1385  		s.scanner.AmbientContext.AddLogTag("s", s.StoreID())
  1386  	}
  1387  
  1388  	// If the nodeID is 0, it has not been assigned yet.
  1389  	if s.nodeDesc.NodeID != 0 && s.Ident.NodeID != s.nodeDesc.NodeID {
  1390  		return errors.Errorf("node id:%d does not equal the one in node descriptor:%d", s.Ident.NodeID, s.nodeDesc.NodeID)
  1391  	}
  1392  	// Always set gossip NodeID before gossiping any info.
  1393  	if s.cfg.Gossip != nil {
  1394  		s.cfg.Gossip.NodeID.Set(ctx, s.Ident.NodeID)
  1395  	}
  1396  
  1397  	// Create ID allocators.
  1398  	idAlloc, err := idalloc.NewAllocator(idalloc.Options{
  1399  		AmbientCtx:  s.cfg.AmbientCtx,
  1400  		Key:         keys.RangeIDGenerator,
  1401  		Incrementer: idalloc.DBIncrementer(s.db),
  1402  		BlockSize:   rangeIDAllocCount,
  1403  		Stopper:     s.stopper,
  1404  	})
  1405  	if err != nil {
  1406  		return err
  1407  	}
  1408  
  1409  	// Create the intent resolver.
  1410  	s.intentResolver = intentresolver.New(intentresolver.Config{
  1411  		Clock:                s.cfg.Clock,
  1412  		DB:                   s.db,
  1413  		Stopper:              stopper,
  1414  		TaskLimit:            s.cfg.IntentResolverTaskLimit,
  1415  		AmbientCtx:           s.cfg.AmbientCtx,
  1416  		TestingKnobs:         s.cfg.TestingKnobs.IntentResolverKnobs,
  1417  		RangeDescriptorCache: s.cfg.RangeDescriptorCache,
  1418  	})
  1419  	s.metrics.registry.AddMetricStruct(s.intentResolver.Metrics)
  1420  
  1421  	// Create the recovery manager.
  1422  	s.recoveryMgr = txnrecovery.NewManager(
  1423  		s.cfg.AmbientCtx, s.cfg.Clock, s.db, stopper,
  1424  	)
  1425  	s.metrics.registry.AddMetricStruct(s.recoveryMgr.Metrics())
  1426  
  1427  	s.rangeIDAlloc = idAlloc
  1428  
  1429  	now := s.cfg.Clock.Now()
  1430  	s.startedAt = now.WallTime
  1431  
  1432  	// Iterate over all range descriptors, ignoring uncommitted versions
  1433  	// (consistent=false). Uncommitted intents which have been abandoned
  1434  	// due to a split crashing halfway will simply be resolved on the
  1435  	// next split attempt. They can otherwise be ignored.
  1436  
  1437  	// TODO(peter): While we have to iterate to find the replica descriptors
  1438  	// serially, we can perform the migrations and replica creation
  1439  	// concurrently. Note that while we can perform this initialization
  1440  	// concurrently, all of the initialization must be performed before we start
  1441  	// listening for Raft messages and start the Raft processing loop.
  1442  	err = IterateRangeDescriptors(ctx, s.engine,
  1443  		func(desc roachpb.RangeDescriptor) (bool, error) {
  1444  			if !desc.IsInitialized() {
  1445  				return false, errors.Errorf("found uninitialized RangeDescriptor: %+v", desc)
  1446  			}
  1447  			replicaDesc, found := desc.GetReplicaDescriptor(s.StoreID())
  1448  			if !found {
  1449  				// This is a pre-emptive snapshot. It's also possible that this is a
  1450  				// range which has processed a raft command to remove itself (which is
  1451  				// possible prior to 19.2 or if the DisableEagerReplicaRemoval is
  1452  				// enabled) and has not yet been removed by the replica gc queue.
  1453  				// We treat both cases the same way. These should no longer exist in
  1454  				// 20.2 or after as there was a migration in 20.1 to remove them and
  1455  				// no pre-emptive snapshot should have been sent since 19.2 was
  1456  				// finalized.
  1457  				return false /* done */, errors.AssertionFailedf(
  1458  					"found RangeDescriptor for range %d at generation %d which does not"+
  1459  						" contain this store %d",
  1460  					log.Safe(desc.RangeID),
  1461  					log.Safe(desc.Generation),
  1462  					log.Safe(s.StoreID()))
  1463  			}
  1464  
  1465  			rep, err := newReplica(ctx, &desc, s, replicaDesc.ReplicaID)
  1466  			if err != nil {
  1467  				return false, err
  1468  			}
  1469  
  1470  			// We can't lock s.mu across NewReplica due to the lock ordering
  1471  			// constraint (*Replica).raftMu < (*Store).mu. See the comment on
  1472  			// (Store).mu.
  1473  			s.mu.Lock()
  1474  			err = s.addReplicaInternalLocked(rep)
  1475  			s.mu.Unlock()
  1476  			if err != nil {
  1477  				return false, err
  1478  			}
  1479  
  1480  			// Add this range and its stats to our counter.
  1481  			s.metrics.ReplicaCount.Inc(1)
  1482  			s.metrics.addMVCCStats(rep.GetMVCCStats())
  1483  
  1489  
  1490  			// Note that we do not create raft groups at this time; they will be created
  1491  			// on-demand the first time they are needed. This helps reduce the amount of
  1492  			// election-related traffic in a cold start.
  1493  			// Raft initialization occurs when we propose a command on this range or
  1494  			// receive a raft message addressed to it.
  1495  			// TODO(bdarnell): Also initialize raft groups when read leases are needed.
  1496  			// TODO(bdarnell): Scan all ranges at startup for unapplied log entries
  1497  			// and initialize those groups.
  1498  			return false, nil
  1499  		})
  1500  	if err != nil {
  1501  		return err
  1502  	}
  1503  
  1504  	// Start Raft processing goroutines.
  1505  	s.cfg.Transport.Listen(s.StoreID(), s)
  1506  	s.processRaft(ctx)
  1507  
  1508  	// Register a callback to unquiesce any ranges with replicas on a
  1509  	// node transitioning from non-live to live.
  1510  	if s.cfg.NodeLiveness != nil {
  1511  		s.cfg.NodeLiveness.RegisterCallback(s.nodeIsLiveCallback)
  1512  	}
  1513  
  1514  	// Gossip is only ever nil while bootstrapping a cluster and
  1515  	// in unittests.
  1516  	if s.cfg.Gossip != nil {
  1517  		// Register update channel for any changes to the system config.
  1518  		// This may trigger splits along structured boundaries,
  1519  		// and update max range bytes.
  1520  		gossipUpdateC := s.cfg.Gossip.RegisterSystemConfigChannel()
  1521  		s.stopper.RunWorker(ctx, func(context.Context) {
  1522  			for {
  1523  				select {
  1524  				case <-gossipUpdateC:
  1525  					cfg := s.cfg.Gossip.GetSystemConfig()
  1526  					s.systemGossipUpdate(cfg)
  1527  				case <-s.stopper.ShouldStop():
  1528  					return
  1529  				}
  1530  			}
  1531  		})
  1532  
  1533  		// Start a single goroutine in charge of periodically gossiping the
  1534  		// sentinel and first range metadata if we have a first range.
  1535  		// This may wake up ranges and requires everything to be set up and
  1536  		// running.
  1537  		s.startGossip()
  1538  
  1539  		// Start the scanner. The construction here makes sure that the scanner
  1540  		// only starts after Gossip has connected, and that it does not block Start
  1541  		// from returning (as doing so might prevent Gossip from ever connecting).
  1542  		s.stopper.RunWorker(ctx, func(context.Context) {
  1543  			select {
  1544  			case <-s.cfg.Gossip.Connected:
  1545  				s.scanner.Start(s.stopper)
  1546  			case <-s.stopper.ShouldStop():
  1547  				return
  1548  			}
  1549  		})
  1550  	}
  1551  
  1552  	if !s.cfg.TestingKnobs.DisableAutomaticLeaseRenewal {
  1553  		s.startLeaseRenewer(ctx)
  1554  	}
  1555  
  1556  	// Connect rangefeeds to closed timestamp updates.
  1557  	s.startClosedTimestampRangefeedSubscriber(ctx)
  1558  
  1559  	if s.replicateQueue != nil {
  1560  		s.storeRebalancer = NewStoreRebalancer(
  1561  			s.cfg.AmbientCtx, s.cfg.Settings, s.replicateQueue, s.replRankings)
  1562  		s.storeRebalancer.Start(ctx, s.stopper)
  1563  	}
  1564  
  1565  	// Start the storage engine compactor.
  1566  	if envutil.EnvOrDefaultBool("COCKROACH_ENABLE_COMPACTOR", true) {
  1567  		s.compactor.Start(s.AnnotateCtx(context.Background()), s.stopper)
  1568  	}
  1569  
  1570  	// Set the started flag (for unittests).
  1571  	atomic.StoreInt32(&s.started, 1)
  1572  
  1573  	return nil
  1574  }
  1575  
  1576  // WaitForInit waits for any asynchronous processes begun in Start()
  1577  // to complete their initialization. In particular, this includes
  1578  // gossiping. In some cases this may block until the range GC queue
  1579  // has completed its scan. Only for testing.
  1580  func (s *Store) WaitForInit() {
  1581  	s.initComplete.Wait()
  1582  }
  1583  
  1584  var errPeriodicGossipsDisabled = errors.New("periodic gossip is disabled")
  1585  
  1586  // startGossip runs an infinite loop in a goroutine which regularly checks
  1587  // whether the store has a first range or config replica and asks those ranges
  1588  // to gossip accordingly.
  1589  func (s *Store) startGossip() {
  1590  	wakeReplica := func(ctx context.Context, repl *Replica) error {
  1591  		// Acquire the range lease, which in turn triggers system data gossip
  1592  		// functions (e.g. MaybeGossipSystemConfig or MaybeGossipNodeLiveness).
  1593  		_, pErr := repl.getLeaseForGossip(ctx)
  1594  		return pErr.GoError()
  1595  	}
  1596  
  1597  	if s.cfg.TestingKnobs.DisablePeriodicGossips {
  1598  		wakeReplica = func(context.Context, *Replica) error {
  1599  			return errPeriodicGossipsDisabled
  1600  		}
  1601  	}
  1602  
  1603  	gossipFns := []struct {
  1604  		key         roachpb.Key
  1605  		fn          func(context.Context, *Replica) error
  1606  		description string
  1607  		interval    time.Duration
  1608  	}{
  1609  		{
  1610  			key: roachpb.KeyMin,
  1611  			fn: func(ctx context.Context, repl *Replica) error {
  1612  				// The first range is gossiped by all replicas, not just the lease
  1613  				// holder, so wakeReplica is not used here.
  1614  				return repl.maybeGossipFirstRange(ctx).GoError()
  1615  			},
  1616  			description: "first range descriptor",
  1617  			interval:    s.cfg.SentinelGossipTTL() / 2,
  1618  		},
  1619  		{
  1620  			key:         keys.SystemConfigSpan.Key,
  1621  			fn:          wakeReplica,
  1622  			description: "system config",
  1623  			interval:    systemDataGossipInterval,
  1624  		},
  1625  		{
  1626  			key:         keys.NodeLivenessSpan.Key,
  1627  			fn:          wakeReplica,
  1628  			description: "node liveness",
  1629  			interval:    systemDataGossipInterval,
  1630  		},
  1631  	}
  1632  
  1633  	// Periodic updates run in a goroutine and signal a WaitGroup upon completion
  1634  	// of their first iteration.
  1635  	s.initComplete.Add(len(gossipFns))
  1636  	for _, gossipFn := range gossipFns {
  1637  		gossipFn := gossipFn // per-iteration copy
  1638  		s.stopper.RunWorker(context.Background(), func(ctx context.Context) {
  1639  			ticker := time.NewTicker(gossipFn.interval)
  1640  			defer ticker.Stop()
  1641  			for first := true; ; {
  1642  				// Retry in a backoff loop until gossipFn succeeds. The gossipFn might
  1643  				// temporarily fail (e.g. because node liveness hasn't initialized yet
  1644  				// making it impossible to get an epoch-based range lease), in which
  1645  				// case we want to retry quickly.
  1646  				retryOptions := base.DefaultRetryOptions()
  1647  				retryOptions.Closer = s.stopper.ShouldStop()
  1648  				for r := retry.Start(retryOptions); r.Next(); {
  1649  					if repl := s.LookupReplica(roachpb.RKey(gossipFn.key)); repl != nil {
  1650  						annotatedCtx := repl.AnnotateCtx(ctx)
  1651  						if err := gossipFn.fn(annotatedCtx, repl); err != nil {
  1652  							log.Warningf(annotatedCtx, "could not gossip %s: %+v", gossipFn.description, err)
  1653  							if !errors.Is(err, errPeriodicGossipsDisabled) {
  1654  								continue
  1655  							}
  1656  						}
  1657  					}
  1658  					break
  1659  				}
  1660  				if first {
  1661  					first = false
  1662  					s.initComplete.Done()
  1663  				}
  1664  				select {
  1665  				case <-ticker.C:
  1666  				case <-s.stopper.ShouldStop():
  1667  					return
  1668  				}
  1669  			}
  1670  		})
  1671  	}
  1672  }
  1673  
  1674  // startLeaseRenewer runs an infinite loop in a goroutine which regularly
  1675  // checks whether the store has any expiration-based leases that should be
  1676  // proactively renewed and attempts to continue renewing them.
  1677  //
  1678  // This reduces user-visible latency when range lookups are needed to serve a
  1679  // request and reduces ping-ponging of r1's lease to different replicas as
  1680  // maybeGossipFirstRange is called on each (e.g.  #24753).
  1681  func (s *Store) startLeaseRenewer(ctx context.Context) {
  1682  	// Start a goroutine that watches and proactively renews certain
  1683  	// expiration-based leases.
  1684  	s.stopper.RunWorker(ctx, func(ctx context.Context) {
  1685  		timer := timeutil.NewTimer()
  1686  		defer timer.Stop()
  1687  
  1688  		// Determine how frequently to attempt to ensure that we have each lease.
  1689  		// The divisor used here is somewhat arbitrary, but needs to be large
  1690  		// enough to ensure we'll attempt to renew the lease reasonably early
  1691  		// within the RangeLeaseRenewalDuration time window. This means we'll wake
  1692  		// up more often than strictly necessary, but it's more maintainable than
  1693  		// attempting to accurately determine exactly when each iteration of a
  1694  		// lease expires and when we should attempt to renew it as a result.
  1695  		renewalDuration := s.cfg.RangeLeaseActiveDuration() / 5
  1696  		for {
  1697  			numRenewableLeases := 0
  1698  			s.renewableLeases.Range(func(k int64, v unsafe.Pointer) bool {
  1699  				numRenewableLeases++
  1700  				repl := (*Replica)(v)
  1701  				annotatedCtx := repl.AnnotateCtx(ctx)
  1702  				if _, pErr := repl.redirectOnOrAcquireLease(annotatedCtx); pErr != nil {
  1703  					if _, ok := pErr.GetDetail().(*roachpb.NotLeaseHolderError); !ok {
  1704  						log.Warningf(annotatedCtx, "failed to proactively renew lease: %s", pErr)
  1705  					}
  1706  					s.renewableLeases.Delete(k)
  1707  				}
  1708  				return true
  1709  			})
  1710  			if numRenewableLeases > 0 {
  1711  				timer.Reset(renewalDuration)
  1712  			}
  1713  			select {
  1714  			case <-s.renewableLeasesSignal:
  1715  			case <-timer.C:
  1716  				timer.Read = true
  1717  			case <-s.stopper.ShouldStop():
  1718  				return
  1719  			}
  1720  		}
  1721  	})
  1722  }
  1723  
  1724  // startClosedTimestampRangefeedSubscriber establishes a new ClosedTimestamp
  1725  // subscription and runs an infinite loop to listen for closed timestamp updates
  1726  // and inform Replicas with active Rangefeeds about them.
  1727  func (s *Store) startClosedTimestampRangefeedSubscriber(ctx context.Context) {
  1728  	// NB: We can't use Stopper.RunWorker because doing so would race with
  1729  	// calling Stopper.Stop. We give the subscription channel a small capacity
  1730  	// to avoid blocking the closed timestamp goroutine.
  1731  	ch := make(chan ctpb.Entry, 8)
  1732  	const name = "closedts-rangefeed-subscriber"
  1733  	if err := s.stopper.RunAsyncTask(ctx, name, func(ctx context.Context) {
  1734  		s.cfg.ClosedTimestamp.Provider.Subscribe(ctx, ch)
  1735  	}); err != nil {
  1736  		return
  1737  	}
  1738  
  1739  	s.stopper.RunWorker(ctx, func(ctx context.Context) {
  1740  		var replIDs []roachpb.RangeID
  1741  		for {
  1742  			select {
  1743  			case <-ch:
  1744  				// Drain all notifications from the channel.
  1745  			loop:
  1746  				for {
  1747  					select {
  1748  					case _, ok := <-ch:
  1749  						if !ok {
  1750  							break loop
  1751  						}
  1752  					default:
  1753  						break loop
  1754  					}
  1755  				}
  1756  
  1757  				// Gather replicas to notify under lock.
  1758  				s.rangefeedReplicas.Lock()
  1759  				for replID := range s.rangefeedReplicas.m {
  1760  					replIDs = append(replIDs, replID)
  1761  				}
  1762  				s.rangefeedReplicas.Unlock()
  1763  
  1764  				// Notify each replica with an active rangefeed to
  1765  				// check for an updated closed timestamp.
  1766  				for _, replID := range replIDs {
  1767  					repl, err := s.GetReplica(replID)
  1768  					if err != nil {
  1769  						continue
  1770  					}
  1771  					repl.handleClosedTimestampUpdate(ctx)
  1772  				}
  1773  				replIDs = replIDs[:0]
  1774  			case <-s.stopper.ShouldQuiesce():
  1775  				return
  1776  			}
  1777  		}
  1778  	})
  1779  }
  1780  
  1781  func (s *Store) addReplicaWithRangefeed(rangeID roachpb.RangeID) {
  1782  	s.rangefeedReplicas.Lock()
  1783  	s.rangefeedReplicas.m[rangeID] = struct{}{}
  1784  	s.rangefeedReplicas.Unlock()
  1785  }
  1786  
  1787  func (s *Store) removeReplicaWithRangefeed(rangeID roachpb.RangeID) {
  1788  	s.rangefeedReplicas.Lock()
  1789  	delete(s.rangefeedReplicas.m, rangeID)
  1790  	s.rangefeedReplicas.Unlock()
  1791  }
  1792  
  1793  // systemGossipUpdate is a callback for gossip updates to
  1794  // the system config which affect range split boundaries.
  1795  func (s *Store) systemGossipUpdate(sysCfg *config.SystemConfig) {
  1796  	ctx := s.AnnotateCtx(context.Background())
  1797  	s.computeInitialMetrics.Do(func() {
  1798  		// Metrics depend in part on the system config. Compute them as soon as we
  1799  		// get the first system config, then periodically in the background
  1800  		// (managed by the Node).
  1801  		if err := s.ComputeMetrics(ctx, -1); err != nil {
  1802  			log.Infof(ctx, "%s: failed initial metrics computation: %s", s, err)
  1803  		}
  1804  		log.Event(ctx, "computed initial metrics")
  1805  	})
  1806  
  1807  	// We'll want to offer all replicas to the split and merge queues. Be a little
  1808  	// careful about not spawning too many individual goroutines.
  1809  
  1810  	// For every range, update its zone config and check if it needs to
  1811  	// be split or merged.
  1812  	now := s.cfg.Clock.Now()
  1813  	newStoreReplicaVisitor(s).Visit(func(repl *Replica) bool {
  1814  		key := repl.Desc().StartKey
  1815  		zone, err := sysCfg.GetZoneConfigForKey(key)
  1816  		if err != nil {
  1817  			if log.V(1) {
  1818  				log.Infof(context.TODO(), "failed to get zone config for key %s", key)
  1819  			}
  1820  			zone = s.cfg.DefaultZoneConfig
  1821  		}
  1822  		repl.SetZoneConfig(zone)
  1823  		s.splitQueue.Async(ctx, "gossip update", true /* wait */, func(ctx context.Context, h queueHelper) {
  1824  			h.MaybeAdd(ctx, repl, now)
  1825  		})
  1826  		s.mergeQueue.Async(ctx, "gossip update", true /* wait */, func(ctx context.Context, h queueHelper) {
  1827  			h.MaybeAdd(ctx, repl, now)
  1828  		})
  1829  		return true // more
  1830  	})
  1831  }
  1832  
  1833  func (s *Store) asyncGossipStore(ctx context.Context, reason string, useCached bool) {
  1834  	if err := s.stopper.RunAsyncTask(
  1835  		ctx, fmt.Sprintf("storage.Store: gossip on %s", reason),
  1836  		func(ctx context.Context) {
  1837  			if err := s.GossipStore(ctx, useCached); err != nil {
  1838  				log.Warningf(ctx, "error gossiping on %s: %+v", reason, err)
  1839  			}
  1840  		}); err != nil {
  1841  		log.Warningf(ctx, "unable to gossip on %s: %+v", reason, err)
  1842  	}
  1843  }
  1844  
  1845  // GossipStore broadcasts the store on the gossip network.
  1846  func (s *Store) GossipStore(ctx context.Context, useCached bool) error {
  1847  	// Temporarily indicate that we're gossiping the store capacity to avoid
  1848  	// recursively triggering a gossip of the store capacity.
  1849  	syncutil.StoreFloat64(&s.gossipQueriesPerSecondVal, -1)
  1850  	syncutil.StoreFloat64(&s.gossipWritesPerSecondVal, -1)
  1851  
  1852  	storeDesc, err := s.Descriptor(useCached)
  1853  	if err != nil {
  1854  		return errors.Wrapf(err, "problem getting store descriptor for store %+v", s.Ident)
  1855  	}
  1856  
  1857  	// Set countdown target for re-gossiping capacity earlier than
  1858  	// the usual periodic interval. Re-gossip more rapidly for RangeCount
  1859  	// changes because allocators with stale information are much more
  1860  	// likely to make bad decisions.
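        	// Worked example (illustrative): with RangeCount = 2000 and a delta
        	// fraction of 0.01, the raw countdown is 20, which math.Min below caps
        	// at 3, so the store re-gossips after three range additions or removals;
        	// the lease countdown, by contrast, only gets a floor of 1 via math.Max.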
  1861  	rangeCountdown := float64(storeDesc.Capacity.RangeCount) * s.cfg.GossipWhenCapacityDeltaExceedsFraction
  1862  	atomic.StoreInt32(&s.gossipRangeCountdown, int32(math.Ceil(math.Min(rangeCountdown, 3))))
  1863  	leaseCountdown := float64(storeDesc.Capacity.LeaseCount) * s.cfg.GossipWhenCapacityDeltaExceedsFraction
  1864  	atomic.StoreInt32(&s.gossipLeaseCountdown, int32(math.Ceil(math.Max(leaseCountdown, 1))))
  1865  	syncutil.StoreFloat64(&s.gossipQueriesPerSecondVal, storeDesc.Capacity.QueriesPerSecond)
  1866  	syncutil.StoreFloat64(&s.gossipWritesPerSecondVal, storeDesc.Capacity.WritesPerSecond)
  1867  
  1868  	// Unique gossip key per store.
  1869  	gossipStoreKey := gossip.MakeStoreKey(storeDesc.StoreID)
  1870  	// Gossip store descriptor.
  1871  	return s.cfg.Gossip.AddInfoProto(gossipStoreKey, storeDesc, gossip.StoreTTL)
  1872  }
  1873  
  1874  type capacityChangeEvent int
  1875  
  1876  const (
  1877  	rangeAddEvent capacityChangeEvent = iota
  1878  	rangeRemoveEvent
  1879  	leaseAddEvent
  1880  	leaseRemoveEvent
  1881  )
  1882  
  1883  // maybeGossipOnCapacityChange decrements the countdown on range
  1884  // and leaseholder counts. If it reaches 0, then we trigger an
  1885  // immediate gossip of this store's descriptor, to include updated
  1886  // capacity information.
  1887  func (s *Store) maybeGossipOnCapacityChange(ctx context.Context, cce capacityChangeEvent) {
  1888  	if s.cfg.TestingKnobs.DisableLeaseCapacityGossip && (cce == leaseAddEvent || cce == leaseRemoveEvent) {
  1889  		return
  1890  	}
  1891  
  1892  	// Incrementally adjust stats to keep them up to date even if the
  1893  	// capacity is gossiped, but isn't due yet to be recomputed from scratch.
  1894  	s.cachedCapacity.Lock()
  1895  	switch cce {
  1896  	case rangeAddEvent:
  1897  		s.cachedCapacity.RangeCount++
  1898  	case rangeRemoveEvent:
  1899  		s.cachedCapacity.RangeCount--
  1900  	case leaseAddEvent:
  1901  		s.cachedCapacity.LeaseCount++
  1902  	case leaseRemoveEvent:
  1903  		s.cachedCapacity.LeaseCount--
  1904  	}
  1905  	s.cachedCapacity.Unlock()
  1906  
  1907  	if ((cce == rangeAddEvent || cce == rangeRemoveEvent) && atomic.AddInt32(&s.gossipRangeCountdown, -1) == 0) ||
  1908  		((cce == leaseAddEvent || cce == leaseRemoveEvent) && atomic.AddInt32(&s.gossipLeaseCountdown, -1) == 0) {
  1909  		// Reset countdowns to avoid unnecessary gossiping.
  1910  		atomic.StoreInt32(&s.gossipRangeCountdown, 0)
  1911  		atomic.StoreInt32(&s.gossipLeaseCountdown, 0)
  1912  		s.asyncGossipStore(ctx, "capacity change", true /* useCached */)
  1913  	}
  1914  }
  1915  
  1916  // recordNewPerSecondStats takes recently calculated values for the number of
  1917  // queries and key writes the store is handling and decides whether either has
  1918  // changed enough to justify re-gossiping the store's capacity.
  1919  func (s *Store) recordNewPerSecondStats(newQPS, newWPS float64) {
  1920  	oldQPS := syncutil.LoadFloat64(&s.gossipQueriesPerSecondVal)
  1921  	oldWPS := syncutil.LoadFloat64(&s.gossipWritesPerSecondVal)
  1922  	if oldQPS == -1 || oldWPS == -1 {
  1923  		// Gossiping of store capacity is already ongoing.
  1924  		return
  1925  	}
  1926  
  1927  	const minAbsoluteChange = 100
  1928  	updateForQPS := (newQPS < oldQPS*.5 || newQPS > oldQPS*1.5) && math.Abs(newQPS-oldQPS) > minAbsoluteChange
  1929  	updateForWPS := (newWPS < oldWPS*.5 || newWPS > oldWPS*1.5) && math.Abs(newWPS-oldWPS) > minAbsoluteChange
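        	// Worked example (illustrative): with oldQPS = 1000, the new value must
        	// drop below 500 or exceed 1500, and the absolute delta must also exceed
        	// minAbsoluteChange, so 1000 -> 1400 stays quiet while 1000 -> 1600
        	// triggers a re-gossip.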
  1930  
  1931  	if !updateForQPS && !updateForWPS {
  1932  		return
  1933  	}
  1934  
  1935  	var message string
  1936  	if updateForQPS && updateForWPS {
  1937  		message = "queries-per-second and writes-per-second change"
  1938  	} else if updateForQPS {
  1939  		message = "queries-per-second change"
  1940  	} else {
  1941  		message = "writes-per-second change"
  1942  	}
  1943  	// TODO(a-robinson): Use the provided values to avoid having to recalculate
  1944  	// them in GossipStore.
  1945  	s.asyncGossipStore(context.TODO(), message, false /* useCached */)
  1946  }
  1947  
  1948  // VisitReplicas invokes the visitor on the Store's Replicas until the visitor returns false.
  1949  // Replicas which are added to the Store after iteration begins may or may not be observed.
  1950  func (s *Store) VisitReplicas(visitor func(*Replica) (wantMore bool)) {
  1951  	v := newStoreReplicaVisitor(s)
  1952  	v.Visit(visitor)
  1953  }
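
        // Example (hypothetical caller): counting the store's replicas, with early
        // termination available by returning false from the visitor.
        //
        //	var n int
        //	s.VisitReplicas(func(*Replica) bool {
        //		n++
        //		return true // return false to stop the iteration early
        //	})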
  1954  
  1955  // WriteLastUpTimestamp records the supplied timestamp into the "last up" key
  1956  // on this store. This value should be refreshed whenever this store's node
  1957  // updates its own liveness record; it is used by a restarting store to
  1958  // determine the approximate time that it stopped.
  1959  func (s *Store) WriteLastUpTimestamp(ctx context.Context, time hlc.Timestamp) error {
  1960  	ctx = s.AnnotateCtx(ctx)
  1961  	return storage.MVCCPutProto(
  1962  		ctx,
  1963  		s.engine,
  1964  		nil,
  1965  		keys.StoreLastUpKey(),
  1966  		hlc.Timestamp{},
  1967  		nil,
  1968  		&time,
  1969  	)
  1970  }
  1971  
  1972  // ReadLastUpTimestamp returns the "last up" timestamp recorded in this store.
  1973  // This value can be used to approximate the last time the engine was being
  1974  // served as a store by a running node. If the store does not contain a "last
  1975  // up" timestamp (for example, on a newly bootstrapped store), the zero
  1976  // timestamp is returned instead.
  1977  func (s *Store) ReadLastUpTimestamp(ctx context.Context) (hlc.Timestamp, error) {
  1978  	var timestamp hlc.Timestamp
  1979  	ok, err := storage.MVCCGetProto(ctx, s.Engine(), keys.StoreLastUpKey(), hlc.Timestamp{},
  1980  		&timestamp, storage.MVCCGetOptions{})
  1981  	if err != nil {
  1982  		return hlc.Timestamp{}, err
  1983  	} else if !ok {
  1984  		return hlc.Timestamp{}, nil
  1985  	}
  1986  	return timestamp, nil
  1987  }
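
        // The two methods round-trip through the same key; a sketch of the intended
        // flow (illustrative only):
        //
        //	if err := s.WriteLastUpTimestamp(ctx, s.Clock().Now()); err != nil {
        //		return err
        //	}
        //	// ...node restarts...
        //	lastUp, err := s.ReadLastUpTimestamp(ctx) // zero timestamp if never written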
  1988  
  1989  // WriteHLCUpperBound records an upper bound to the wall time of the HLC
  1990  func (s *Store) WriteHLCUpperBound(ctx context.Context, time int64) error {
  1991  	ctx = s.AnnotateCtx(ctx)
  1992  	ts := hlc.Timestamp{WallTime: time}
  1993  	batch := s.Engine().NewBatch()
  1994  	// Write has to sync to disk to ensure HLC monotonicity across restarts
  1995  	defer batch.Close()
  1996  	if err := storage.MVCCPutProto(
  1997  		ctx,
  1998  		batch,
  1999  		nil,
  2000  		keys.StoreHLCUpperBoundKey(),
  2001  		hlc.Timestamp{},
  2002  		nil,
  2003  		&ts,
  2004  	); err != nil {
  2005  		return err
  2006  	}
  2007  
  2008  	if err := batch.Commit(true /* sync */); err != nil {
  2009  		return err
  2010  	}
  2011  	return nil
  2012  }
  2013  
  2014  // ReadHLCUpperBound returns the upper bound to the wall time of the HLC.
  2015  // If this value does not exist, 0 is returned.
  2016  func ReadHLCUpperBound(ctx context.Context, e storage.Engine) (int64, error) {
  2017  	var timestamp hlc.Timestamp
  2018  	ok, err := storage.MVCCGetProto(ctx, e, keys.StoreHLCUpperBoundKey(), hlc.Timestamp{},
  2019  		&timestamp, storage.MVCCGetOptions{})
  2020  	if err != nil {
  2021  		return 0, err
  2022  	} else if !ok {
  2023  		return 0, nil
  2024  	}
  2025  	return timestamp.WallTime, nil
  2026  }
  2027  
  2028  // ReadMaxHLCUpperBound returns the maximum of the stored hlc upper bounds
  2029  // among all the engines. This value is optionally persisted by the server and
  2030  // it is guaranteed to be higher than any wall time used by the HLC. If this
  2031  // value is persisted, HLC wall clock monotonicity is guaranteed across server
  2032  // restarts
  2033  func ReadMaxHLCUpperBound(ctx context.Context, engines []storage.Engine) (int64, error) {
  2034  	var hlcUpperBound int64
  2035  	for _, e := range engines {
  2036  		engineHLCUpperBound, err := ReadHLCUpperBound(ctx, e)
  2037  		if err != nil {
  2038  			return 0, err
  2039  		}
  2040  		if engineHLCUpperBound > hlcUpperBound {
  2041  			hlcUpperBound = engineHLCUpperBound
  2042  		}
  2043  	}
  2044  	return hlcUpperBound, nil
  2045  }
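
        // A sketch of the intended startup usage (illustrative; how the bound feeds
        // into clock construction is elided here):
        //
        //	hlcUpperBound, err := ReadMaxHLCUpperBound(ctx, engines)
        //	if err != nil {
        //		return err
        //	}
        //	if hlcUpperBound > 0 {
        //		// Advance the HLC past the persisted bound before serving.
        //	}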
  2046  
  2047  // checkCanInitializeEngine ensures that the engine is empty except for a
  2048  // cluster version, which must be present.
  2049  func checkCanInitializeEngine(ctx context.Context, eng storage.Engine) error {
  2050  	kvs, err := storage.Scan(eng, roachpb.KeyMin, roachpb.KeyMax, 10)
  2051  	if err != nil {
  2052  		return err
  2053  	}
  2054  	// See if this is an already-bootstrapped store.
  2055  	ident, err := ReadStoreIdent(ctx, eng)
  2056  	if err == nil {
  2057  		return errors.Errorf("engine already initialized as %s", ident.String())
  2058  	} else if !errors.HasType(err, (*NotBootstrappedError)(nil)) {
  2059  		return errors.Wrap(err, "unable to read store ident")
  2060  	}
  2061  
  2062  	// Engine is not bootstrapped yet (i.e. no StoreIdent). Does it contain
  2063  	// a cluster version and nothing else?
  2064  
  2065  	var sawClusterVersion bool
  2066  	var keyVals []string
  2067  	for _, kv := range kvs {
  2068  		if kv.Key.Key.Equal(keys.StoreClusterVersionKey()) {
  2069  			sawClusterVersion = true
  2070  			continue
  2071  		}
  2072  		keyVals = append(keyVals, fmt.Sprintf("%s: %q", kv.Key, kv.Value))
  2073  	}
  2074  	if len(keyVals) > 0 {
  2075  		return errors.Errorf("engine cannot be bootstrapped, contains:\n%s", keyVals)
  2076  	}
  2077  	if !sawClusterVersion {
  2078  		return errors.New("no cluster version found on uninitialized engine")
  2079  	}
  2080  
  2081  	return nil
  2082  }
  2083  
  2084  // GetReplica fetches a replica by Range ID. Returns an error if no replica is found.
  2085  func (s *Store) GetReplica(rangeID roachpb.RangeID) (*Replica, error) {
  2086  	if value, ok := s.mu.replicas.Load(int64(rangeID)); ok {
  2087  		return (*Replica)(value), nil
  2088  	}
  2089  	return nil, roachpb.NewRangeNotFoundError(rangeID, s.StoreID())
  2090  }
  2091  
  2092  // LookupReplica looks up the replica that contains the specified key. It
  2093  // returns nil if no such replica exists.
  2094  func (s *Store) LookupReplica(key roachpb.RKey) *Replica {
  2095  	s.mu.RLock()
  2096  	defer s.mu.RUnlock()
  2097  	var repl *Replica
  2098  	s.mu.replicasByKey.DescendLessOrEqual(rangeBTreeKey(key), func(item btree.Item) bool {
  2099  		repl, _ = item.(*Replica)
  2100  		// Stop iterating immediately. The first item we see is the only one that
  2101  		// can possibly contain key.
  2102  		return false
  2103  	})
  2104  	if repl == nil || !repl.Desc().ContainsKey(key) {
  2105  		return nil
  2106  	}
  2107  	return repl
  2108  }
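
        // Example (hypothetical caller):
        //
        //	if repl := s.LookupReplica(roachpb.RKey("some-key")); repl != nil {
        //		// repl is the replica whose descriptor contains "some-key".
        //	}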
  2109  
  2110  // lookupPrecedingReplica finds the replica in this store that immediately
  2111  // precedes the specified key without containing it. It returns nil if no such
  2112  // replica exists. It ignores replica placeholders.
  2113  //
  2114  // Concretely, when key represents a key within replica R,
  2115  // lookupPrecedingReplica returns the replica that immediately precedes R in
  2116  // replicasByKey.
  2117  func (s *Store) lookupPrecedingReplica(key roachpb.RKey) *Replica {
  2118  	s.mu.RLock()
  2119  	defer s.mu.RUnlock()
  2120  	var repl *Replica
  2121  	s.mu.replicasByKey.DescendLessOrEqual(rangeBTreeKey(key), func(item btree.Item) bool {
  2122  		if r, ok := item.(*Replica); ok && !r.ContainsKey(key.AsRawKey()) {
  2123  			repl = r
  2124  			return false // stop iterating
  2125  		}
  2126  		return true // keep iterating
  2127  	})
  2128  	return repl
  2129  }
  2130  
  2131  // getOverlappingKeyRangeLocked returns a KeyRange from the Store overlapping the given
  2132  // descriptor (or nil if no such KeyRange exists).
  2133  func (s *Store) getOverlappingKeyRangeLocked(rngDesc *roachpb.RangeDescriptor) KeyRange {
  2134  	var kr KeyRange
  2135  	s.mu.replicasByKey.DescendLessOrEqual(rangeBTreeKey(rngDesc.EndKey),
  2136  		func(item btree.Item) bool {
  2137  			if kr0 := item.(KeyRange); kr0.startKey().Less(rngDesc.EndKey) {
  2138  				kr = kr0
  2139  				return false // stop iterating
  2140  			}
  2141  			return true // keep iterating
  2142  		})
  2143  	if kr != nil && rngDesc.StartKey.Less(kr.Desc().EndKey) {
  2144  		return kr
  2145  	}
  2146  	return nil
  2147  }
  2148  
  2149  // RaftStatus returns the current raft status of the local replica of
  2150  // the given range.
  2151  func (s *Store) RaftStatus(rangeID roachpb.RangeID) *raft.Status {
  2152  	if value, ok := s.mu.replicas.Load(int64(rangeID)); ok {
  2153  		return (*Replica)(value).RaftStatus()
  2154  	}
  2155  	return nil
  2156  }
  2157  
  2158  // ClusterID accessor.
  2159  func (s *Store) ClusterID() uuid.UUID { return s.Ident.ClusterID }
  2160  
  2161  // StoreID accessor.
  2162  func (s *Store) StoreID() roachpb.StoreID { return s.Ident.StoreID }
  2163  
  2164  // Clock accessor.
  2165  func (s *Store) Clock() *hlc.Clock { return s.cfg.Clock }
  2166  
  2167  // Engine accessor.
  2168  func (s *Store) Engine() storage.Engine { return s.engine }
  2169  
  2170  // DB accessor.
  2171  func (s *Store) DB() *kv.DB { return s.cfg.DB }
  2172  
  2173  // Gossip accessor.
  2174  func (s *Store) Gossip() *gossip.Gossip { return s.cfg.Gossip }
  2175  
  2176  // Compactor accessor.
  2177  func (s *Store) Compactor() *compactor.Compactor { return s.compactor }
  2178  
  2179  // Stopper accessor.
  2180  func (s *Store) Stopper() *stop.Stopper { return s.stopper }
  2181  
  2182  // TestingKnobs accessor.
  2183  func (s *Store) TestingKnobs() *StoreTestingKnobs { return &s.cfg.TestingKnobs }
  2184  
  2185  // IsDraining accessor.
  2186  func (s *Store) IsDraining() bool {
  2187  	return s.draining.Load().(bool)
  2188  }
  2189  
  2190  // AllocateRangeID allocates a new RangeID from the cluster-wide RangeID allocator.
  2191  func (s *Store) AllocateRangeID(ctx context.Context) (roachpb.RangeID, error) {
  2192  	id, err := s.rangeIDAlloc.Allocate(ctx)
  2193  	if err != nil {
  2194  		return 0, err
  2195  	}
  2196  	return roachpb.RangeID(id), nil
  2197  }
  2198  
  2199  // Attrs returns the attributes of the underlying store.
  2200  func (s *Store) Attrs() roachpb.Attributes {
  2201  	return s.engine.Attrs()
  2202  }
  2203  
  2204  // Capacity returns the capacity of the underlying storage engine. Note that
  2205  // this does not include reservations.
  2206  // Note that Capacity() has the side effect of updating some of the store's
  2207  // internal statistics about its replicas.
  2208  func (s *Store) Capacity(useCached bool) (roachpb.StoreCapacity, error) {
  2209  	if useCached {
  2210  		s.cachedCapacity.Lock()
  2211  		capacity := s.cachedCapacity.StoreCapacity
  2212  		s.cachedCapacity.Unlock()
  2213  		if capacity != (roachpb.StoreCapacity{}) {
  2214  			return capacity, nil
  2215  		}
  2216  	}
  2217  
  2218  	capacity, err := s.engine.Capacity()
  2219  	if err != nil {
  2220  		return capacity, err
  2221  	}
  2222  
  2223  	now := s.cfg.Clock.Now()
  2224  	var leaseCount int32
  2225  	var rangeCount int32
  2226  	var logicalBytes int64
  2227  	var totalQueriesPerSecond float64
  2228  	var totalWritesPerSecond float64
  2229  	replicaCount := s.metrics.ReplicaCount.Value()
  2230  	bytesPerReplica := make([]float64, 0, replicaCount)
  2231  	writesPerReplica := make([]float64, 0, replicaCount)
  2232  	rankingsAccumulator := s.replRankings.newAccumulator()
  2233  	newStoreReplicaVisitor(s).Visit(func(r *Replica) bool {
  2234  		rangeCount++
  2235  		if r.OwnsValidLease(now) {
  2236  			leaseCount++
  2237  		}
  2238  		mvccStats := r.GetMVCCStats()
  2239  		logicalBytes += mvccStats.Total()
  2240  		bytesPerReplica = append(bytesPerReplica, float64(mvccStats.Total()))
  2241  		// TODO(a-robinson): How dangerous is it that these numbers will be
  2242  		// incorrectly low the first time or two it gets gossiped when a store
  2243  		// starts? We can't easily have a countdown as its value changes like for
  2244  		// leases/replicas.
  2245  		var qps float64
  2246  		if avgQPS, dur := r.leaseholderStats.avgQPS(); dur >= MinStatsDuration {
  2247  			qps = avgQPS
  2248  			totalQueriesPerSecond += avgQPS
  2249  			// TODO(a-robinson): Calculate percentiles for qps? Get rid of other percentiles?
  2250  		}
  2251  		if wps, dur := r.writeStats.avgQPS(); dur >= MinStatsDuration {
  2252  			totalWritesPerSecond += wps
  2253  			writesPerReplica = append(writesPerReplica, wps)
  2254  		}
  2255  		rankingsAccumulator.addReplica(replicaWithStats{
  2256  			repl: r,
  2257  			qps:  qps,
  2258  		})
  2259  		return true
  2260  	})
  2261  	capacity.RangeCount = rangeCount
  2262  	capacity.LeaseCount = leaseCount
  2263  	capacity.LogicalBytes = logicalBytes
  2264  	capacity.QueriesPerSecond = totalQueriesPerSecond
  2265  	capacity.WritesPerSecond = totalWritesPerSecond
  2266  	capacity.BytesPerReplica = roachpb.PercentilesFromData(bytesPerReplica)
  2267  	capacity.WritesPerReplica = roachpb.PercentilesFromData(writesPerReplica)
  2268  	s.recordNewPerSecondStats(totalQueriesPerSecond, totalWritesPerSecond)
  2269  	s.replRankings.update(rankingsAccumulator)
  2270  
  2271  	s.cachedCapacity.Lock()
  2272  	s.cachedCapacity.StoreCapacity = capacity
  2273  	s.cachedCapacity.Unlock()
  2274  
  2275  	return capacity, nil
  2276  }
  2277  
  2278  // ReplicaCount returns the number of replicas contained by this store. This
  2279  // method is O(n) in the number of replicas and should not be called from
  2280  // performance critical code.
  2281  func (s *Store) ReplicaCount() int {
  2282  	var count int
  2283  	s.mu.replicas.Range(func(_ int64, _ unsafe.Pointer) bool {
  2284  		count++
  2285  		return true
  2286  	})
  2287  	return count
  2288  }
  2289  
  2290  // Registry returns the store registry.
  2291  func (s *Store) Registry() *metric.Registry {
  2292  	return s.metrics.registry
  2293  }
  2294  
  2295  // Metrics returns the store's metric struct.
  2296  func (s *Store) Metrics() *StoreMetrics {
  2297  	return s.metrics
  2298  }
  2299  
  2300  // Descriptor returns a StoreDescriptor including current store
  2301  // capacity information.
  2302  func (s *Store) Descriptor(useCached bool) (*roachpb.StoreDescriptor, error) {
  2303  	capacity, err := s.Capacity(useCached)
  2304  	if err != nil {
  2305  		return nil, err
  2306  	}
  2307  
  2308  	// Initialize the store descriptor.
  2309  	return &roachpb.StoreDescriptor{
  2310  		StoreID:  s.Ident.StoreID,
  2311  		Attrs:    s.Attrs(),
  2312  		Node:     *s.nodeDesc,
  2313  		Capacity: capacity,
  2314  	}, nil
  2315  }
  2316  
  2317  // RangeFeed registers a rangefeed over the specified span. It sends updates to
  2318  // the provided stream and returns with an optional error when the rangefeed is
  2319  // complete.
  2320  func (s *Store) RangeFeed(
  2321  	args *roachpb.RangeFeedRequest, stream roachpb.Internal_RangeFeedServer,
  2322  ) *roachpb.Error {
  2323  
  2324  	if filter := s.TestingKnobs().TestingRangefeedFilter; filter != nil {
  2325  		if pErr := filter(args, stream); pErr != nil {
  2326  			return pErr
  2327  		}
  2328  	}
  2329  
  2330  	if err := verifyKeys(args.Span.Key, args.Span.EndKey, true); err != nil {
  2331  		return roachpb.NewError(err)
  2332  	}
  2333  
  2334  	// Get range and add command to the range for execution.
  2335  	repl, err := s.GetReplica(args.RangeID)
  2336  	if err != nil {
  2337  		return roachpb.NewError(err)
  2338  	}
  2339  	if !repl.IsInitialized() {
  2340  		// (*Store).Send has an optimization for uninitialized replicas to send back
  2341  		// a NotLeaseHolderError with a hint of where an initialized replica might
  2342  		// be found. RangeFeeds can always be served from followers and so don't
  2343  		// otherwise return NotLeaseHolderError. For simplicity we also don't return
  2344  		// one here.
  2345  		return roachpb.NewError(roachpb.NewRangeNotFoundError(args.RangeID, s.StoreID()))
  2346  	}
  2347  	return repl.RangeFeed(args, stream)
  2348  }
  2349  
  2350  // updateReplicationGauges counts a number of simple replication statistics for
  2351  // the ranges in this store.
  2352  // TODO(bram): #4564 It may be appropriate to compute these statistics while
  2353  // scanning ranges. An ideal solution would be to create incremental events
  2354  // whenever availability changes.
  2355  func (s *Store) updateReplicationGauges(ctx context.Context) error {
  2356  	// Load the system config.
  2357  	cfg := s.Gossip().GetSystemConfig()
  2358  	if cfg == nil {
  2359  		return errors.Errorf("%s: system config not yet available", s)
  2360  	}
  2361  
  2362  	var (
  2363  		raftLeaderCount               int64
  2364  		leaseHolderCount              int64
  2365  		leaseExpirationCount          int64
  2366  		leaseEpochCount               int64
  2367  		raftLeaderNotLeaseHolderCount int64
  2368  		quiescentCount                int64
  2369  		averageQueriesPerSecond       float64
  2370  		averageWritesPerSecond        float64
  2371  
  2372  		rangeCount                int64
  2373  		unavailableRangeCount     int64
  2374  		underreplicatedRangeCount int64
  2375  		overreplicatedRangeCount  int64
  2376  		behindCount               int64
  2377  	)
  2378  
  2379  	timestamp := s.cfg.Clock.Now()
  2380  	var livenessMap IsLiveMap
  2381  	if s.cfg.NodeLiveness != nil {
  2382  		livenessMap = s.cfg.NodeLiveness.GetIsLiveMap()
  2383  	}
  2384  	clusterNodes := s.ClusterNodeCount()
  2385  
  2386  	var minMaxClosedTS hlc.Timestamp
  2387  	newStoreReplicaVisitor(s).Visit(func(rep *Replica) bool {
  2388  		metrics := rep.Metrics(ctx, timestamp, livenessMap, clusterNodes)
  2389  		if metrics.Leader {
  2390  			raftLeaderCount++
  2391  			if metrics.LeaseValid && !metrics.Leaseholder {
  2392  				raftLeaderNotLeaseHolderCount++
  2393  			}
  2394  		}
  2395  		if metrics.Leaseholder {
  2396  			leaseHolderCount++
  2397  			switch metrics.LeaseType {
  2398  			case roachpb.LeaseNone:
  2399  			case roachpb.LeaseExpiration:
  2400  				leaseExpirationCount++
  2401  			case roachpb.LeaseEpoch:
  2402  				leaseEpochCount++
  2403  			}
  2404  		}
  2405  		if metrics.Quiescent {
  2406  			quiescentCount++
  2407  		}
  2408  		if metrics.RangeCounter {
  2409  			rangeCount++
  2410  			if metrics.Unavailable {
  2411  				unavailableRangeCount++
  2412  			}
  2413  			if metrics.Underreplicated {
  2414  				underreplicatedRangeCount++
  2415  			}
  2416  			if metrics.Overreplicated {
  2417  				overreplicatedRangeCount++
  2418  			}
  2419  		}
  2420  		behindCount += metrics.BehindCount
  2421  		if qps, dur := rep.leaseholderStats.avgQPS(); dur >= MinStatsDuration {
  2422  			averageQueriesPerSecond += qps
  2423  		}
  2424  		if wps, dur := rep.writeStats.avgQPS(); dur >= MinStatsDuration {
  2425  			averageWritesPerSecond += wps
  2426  		}
  2427  		mc, ok := rep.maxClosed(ctx)
  2428  		if ok && (minMaxClosedTS.IsEmpty() || mc.Less(minMaxClosedTS)) {
  2429  			minMaxClosedTS = mc
  2430  		}
  2431  		return true // more
  2432  	})
  2433  
  2434  	s.metrics.RaftLeaderCount.Update(raftLeaderCount)
  2435  	s.metrics.RaftLeaderNotLeaseHolderCount.Update(raftLeaderNotLeaseHolderCount)
  2436  	s.metrics.LeaseHolderCount.Update(leaseHolderCount)
  2437  	s.metrics.LeaseExpirationCount.Update(leaseExpirationCount)
  2438  	s.metrics.LeaseEpochCount.Update(leaseEpochCount)
  2439  	s.metrics.QuiescentCount.Update(quiescentCount)
  2440  	s.metrics.AverageQueriesPerSecond.Update(averageQueriesPerSecond)
  2441  	s.metrics.AverageWritesPerSecond.Update(averageWritesPerSecond)
  2442  	s.recordNewPerSecondStats(averageQueriesPerSecond, averageWritesPerSecond)
  2443  
  2444  	s.metrics.RangeCount.Update(rangeCount)
  2445  	s.metrics.UnavailableRangeCount.Update(unavailableRangeCount)
  2446  	s.metrics.UnderReplicatedRangeCount.Update(underreplicatedRangeCount)
  2447  	s.metrics.OverReplicatedRangeCount.Update(overreplicatedRangeCount)
  2448  	s.metrics.RaftLogFollowerBehindCount.Update(behindCount)
  2449  
  2450  	if !minMaxClosedTS.IsEmpty() {
  2451  		nanos := timeutil.Since(minMaxClosedTS.GoTime()).Nanoseconds()
  2452  		s.metrics.ClosedTimestampMaxBehindNanos.Update(nanos)
  2453  	}
  2454  
  2455  	return nil
  2456  }
  2457  
  2458  // checkpoint creates a RocksDB checkpoint in the auxiliary directory with the
  2459  // provided tag used in the filepath. The filepath for the checkpoint directory
  2460  // is returned.
  2461  func (s *Store) checkpoint(ctx context.Context, tag string) (string, error) {
  2462  	checkpointBase := filepath.Join(s.engine.GetAuxiliaryDir(), "checkpoints")
  2463  	_ = os.MkdirAll(checkpointBase, 0700)
  2464  
  2465  	checkpointDir := filepath.Join(checkpointBase, tag)
  2466  	if err := s.engine.CreateCheckpoint(checkpointDir); err != nil {
  2467  		return "", err
  2468  	}
  2469  
  2470  	return checkpointDir, nil
  2471  }
  2472  
  2473  // ComputeMetrics immediately computes the current value of store metrics which
  2474  // cannot be computed incrementally. This method should be invoked periodically
  2475  // by a higher-level system which records store metrics.
  2476  //
  2477  // The tick argument should increment across repeated calls to this
  2478  // method. It is used to compute some metrics less frequently than others.
  2479  func (s *Store) ComputeMetrics(ctx context.Context, tick int) error {
  2480  	ctx = s.AnnotateCtx(ctx)
  2481  	if err := s.updateCapacityGauges(); err != nil {
  2482  		return err
  2483  	}
  2484  	if err := s.updateReplicationGauges(ctx); err != nil {
  2485  		return err
  2486  	}
  2487  
  2488  	// Get the latest RocksDB stats.
  2489  	stats, err := s.engine.GetStats()
  2490  	if err != nil {
  2491  		return err
  2492  	}
  2493  	s.metrics.updateRocksDBStats(*stats)
  2494  
  2495  	// Get engine Env stats.
  2496  	envStats, err := s.engine.GetEnvStats()
  2497  	if err != nil {
  2498  		return err
  2499  	}
  2500  	s.metrics.updateEnvStats(*envStats)
  2501  
  2502  	sstables := s.engine.GetSSTables()
  2503  	s.metrics.RdbNumSSTables.Update(int64(sstables.Len()))
  2504  	readAmp := sstables.ReadAmplification()
  2505  	s.metrics.RdbReadAmplification.Update(int64(readAmp))
  2506  	s.metrics.RdbPendingCompaction.Update(stats.PendingCompactionBytesEstimate)
  2507  	// Log this metric infrequently (with current configurations,
  2508  	// every 10 minutes). Trigger on tick 1 instead of tick 0 so that
  2509  	// non-periodic callers of this method don't trigger expensive
  2510  	// stats.
  2511  	if tick%logSSTInfoTicks == 1 /* every 10m */ {
  2512  		log.Infof(ctx, "sstables (read amplification = %d):\n%s", readAmp, sstables)
  2513  		log.Infof(ctx, "%s", s.engine.GetCompactionStats())
  2514  	}
  2515  	return nil
  2516  }
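
        // A sketch of a periodic caller (illustrative; the 10s cadence is an
        // assumption, not something prescribed by this method):
        //
        //	for tick := 0; ; tick++ {
        //		if err := s.ComputeMetrics(ctx, tick); err != nil {
        //			log.Warningf(ctx, "failed to compute metrics: %v", err)
        //		}
        //		time.Sleep(10 * time.Second)
        //	}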
  2517  
  2518  // ClusterNodeCount returns this store's view of the number of nodes in the
  2519  // cluster. This is the metric used for adaptive zone configs; ranges will not
  2520  // be reported as underreplicated if it is low. Tests that wait for full
  2521  // replication by tracking the underreplicated metric must also check for the
  2522  // expected ClusterNodeCount to avoid catching the cluster while the first node
  2523  // is initialized but the other nodes are not.
  2524  func (s *Store) ClusterNodeCount() int {
  2525  	return s.cfg.StorePool.ClusterNodeCount()
  2526  }
  2527  
  2528  // HotReplicaInfo contains a range descriptor and its QPS.
  2529  type HotReplicaInfo struct {
  2530  	Desc *roachpb.RangeDescriptor
  2531  	QPS  float64
  2532  }
  2533  
  2534  // HottestReplicas returns the hottest replicas on a store, sorted by their
  2535  // QPS. Only contains ranges for which this store is the leaseholder.
  2536  //
  2537  // Note that this uses cached information, so it's cheap but may be slightly
  2538  // out of date.
  2539  func (s *Store) HottestReplicas() []HotReplicaInfo {
  2540  	topQPS := s.replRankings.topQPS()
  2541  	hotRepls := make([]HotReplicaInfo, len(topQPS))
  2542  	for i := range topQPS {
  2543  		hotRepls[i].Desc = topQPS[i].repl.Desc()
  2544  		hotRepls[i].QPS = topQPS[i].qps
  2545  	}
  2546  	return hotRepls
  2547  }
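
        // Example (hypothetical): logging the single hottest range on the store.
        //
        //	if hot := s.HottestReplicas(); len(hot) > 0 {
        //		log.Infof(ctx, "hottest range %s at %.2f qps", hot[0].Desc, hot[0].QPS)
        //	}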
  2548  
  2549  // StoreKeySpanStats carries the result of a stats computation over a key range.
  2550  type StoreKeySpanStats struct {
  2551  	ReplicaCount         int
  2552  	MVCC                 enginepb.MVCCStats
  2553  	ApproximateDiskBytes uint64
  2554  }
  2555  
  2556  // ComputeStatsForKeySpan computes the aggregated MVCCStats for all replicas on
  2557  // this store which contain any keys in the supplied range.
  2558  func (s *Store) ComputeStatsForKeySpan(startKey, endKey roachpb.RKey) (StoreKeySpanStats, error) {
  2559  	var result StoreKeySpanStats
  2560  
  2561  	newStoreReplicaVisitor(s).Visit(func(repl *Replica) bool {
  2562  		desc := repl.Desc()
  2563  		if bytes.Compare(startKey, desc.EndKey) >= 0 || bytes.Compare(desc.StartKey, endKey) >= 0 {
  2564  			return true // continue
  2565  		}
  2566  		result.MVCC.Add(repl.GetMVCCStats())
  2567  		result.ReplicaCount++
  2568  		return true
  2569  	})
  2570  
  2571  	var err error
  2572  	result.ApproximateDiskBytes, err = s.engine.ApproximateDiskBytes(startKey.AsRawKey(), endKey.AsRawKey())
  2573  	return result, err
  2574  }
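
        // Example (illustrative):
        //
        //	stats, err := s.ComputeStatsForKeySpan(roachpb.RKey("a"), roachpb.RKey("b"))
        //	if err == nil {
        //		log.Infof(ctx, "%d replicas, ~%d bytes on disk",
        //			stats.ReplicaCount, stats.ApproximateDiskBytes)
        //	}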
  2575  
  2576  // AllocatorDryRun runs the given replica through the allocator without actually
  2577  // carrying out any changes, returning all trace messages collected along the way.
  2578  // Intended to help power a debug endpoint.
  2579  func (s *Store) AllocatorDryRun(ctx context.Context, repl *Replica) (tracing.Recording, error) {
  2580  	ctx, collect, cancel := tracing.ContextWithRecordingSpan(ctx, "allocator dry run")
  2581  	defer cancel()
  2582  	canTransferLease := func() bool { return true }
  2583  	_, err := s.replicateQueue.processOneChange(
  2584  		ctx, repl, canTransferLease, true /* dryRun */)
  2585  	if err != nil {
  2586  		log.Eventf(ctx, "error simulating allocator on replica %s: %s", repl, err)
  2587  	}
  2588  	return collect(), nil
  2589  }
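
// logAllocatorDryRun is an illustrative sketch, not part of the original
// file: it simulates the allocator for a replica and logs how many trace
// spans the dry run collected. Note that, per the method above, simulation
// errors surface inside the recording rather than as a returned error.
func logAllocatorDryRun(ctx context.Context, s *Store, repl *Replica) error {
	rec, err := s.AllocatorDryRun(ctx, repl)
	if err != nil {
		return err
	}
	log.Infof(ctx, "allocator dry run for %s collected %d trace spans", repl, len(rec))
	return nil
}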
  2590  
  2591  // ManuallyEnqueue runs the given replica through the requested queue,
  2592  // returning all trace events collected along the way as well as the error
  2593  // message returned from the queue's process method, if any. Intended to help
  2594  // power an admin debug endpoint.
  2595  func (s *Store) ManuallyEnqueue(
  2596  	ctx context.Context, queueName string, repl *Replica, skipShouldQueue bool,
  2597  ) (recording tracing.Recording, processError error, enqueueError error) {
  2598  	ctx = repl.AnnotateCtx(ctx)
  2599  
  2600  	var queue queueImpl
  2601  	var needsLease bool
  2602  	for _, replicaQueue := range s.scanner.queues {
  2603  		if strings.EqualFold(replicaQueue.Name(), queueName) {
  2604  			queue = replicaQueue.(queueImpl)
  2605  			needsLease = replicaQueue.NeedsLease()
  2606  		}
  2607  	}
  2608  	if queue == nil {
  2609  		return nil, nil, errors.Errorf("unknown queue type %q", queueName)
  2610  	}
  2611  
  2612  	sysCfg := s.cfg.Gossip.GetSystemConfig()
  2613  	if sysCfg == nil {
  2614  		return nil, nil, errors.New("cannot run queue without a valid system config; make sure the cluster " +
  2615  			"has been initialized and all nodes connected to it")
  2616  	}
  2617  
  2618  	// Many queues are only meant to be run on leaseholder replicas, so attempt to
  2619  	// take the lease here or bail out early if a different replica has it.
  2620  	if needsLease {
  2621  		hasLease, pErr := repl.getLeaseForGossip(ctx)
  2622  		if pErr != nil {
  2623  			return nil, nil, pErr.GoError()
  2624  		}
  2625  		if !hasLease {
  2626  			return nil, errors.Newf("replica %v does not have the range lease", repl), nil
  2627  		}
  2628  	}
  2629  
  2630  	ctx, collect, cancel := tracing.ContextWithRecordingSpan(
  2631  		ctx, fmt.Sprintf("manual %s queue run", queueName))
  2632  	defer cancel()
  2633  
  2634  	if !skipShouldQueue {
  2635  		log.Eventf(ctx, "running %s.shouldQueue", queueName)
  2636  		shouldQueue, priority := queue.shouldQueue(ctx, s.cfg.Clock.Now(), repl, sysCfg)
  2637  		log.Eventf(ctx, "shouldQueue=%v, priority=%f", shouldQueue, priority)
  2638  		if !shouldQueue {
  2639  			return collect(), nil, nil
  2640  		}
  2641  	}
  2642  
  2643  	log.Eventf(ctx, "running %s.process", queueName)
  2644  	processErr := queue.process(ctx, repl, sysCfg)
  2645  	return collect(), processErr, nil
  2646  }
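
// runQueueByName is an illustrative sketch, not part of the original file: it
// shows how a hypothetical debug handler might distinguish the two error
// channels ManuallyEnqueue returns. Queue-name matching above is
// case-insensitive via strings.EqualFold.
func runQueueByName(
	ctx context.Context, s *Store, repl *Replica, name string,
) (tracing.Recording, error) {
	rec, processErr, enqueueErr := s.ManuallyEnqueue(ctx, name, repl, false /* skipShouldQueue */)
	if enqueueErr != nil {
		// The queue could not be run at all: unknown name or no system config.
		return nil, enqueueErr
	}
	if processErr != nil {
		// Includes the case where this replica lacks a required lease.
		log.Warningf(ctx, "queue %q processing failed: %v", name, processErr)
	}
	return rec, nil
}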
  2647  
  2648  // GetClusterVersion reads the cluster version from the store-local version
  2649  // key. Returns an empty version if the key is not found.
  2650  func (s *Store) GetClusterVersion(ctx context.Context) (clusterversion.ClusterVersion, error) {
  2651  	return ReadClusterVersion(ctx, s.engine)
  2652  }
  2653  
  2654  // WriteClusterVersion writes the given cluster version to the store-local cluster version key.
  2655  func WriteClusterVersion(
  2656  	ctx context.Context, writer storage.ReadWriter, cv clusterversion.ClusterVersion,
  2657  ) error {
  2658  	return storage.MVCCPutProto(ctx, writer, nil, keys.StoreClusterVersionKey(), hlc.Timestamp{}, nil, &cv)
  2659  }
  2660  
  2661  // ReadClusterVersion reads the cluster version from the store-local version key.
  2662  func ReadClusterVersion(
  2663  	ctx context.Context, reader storage.Reader,
  2664  ) (clusterversion.ClusterVersion, error) {
  2665  	var cv clusterversion.ClusterVersion
  2666  	_, err := storage.MVCCGetProto(ctx, reader, keys.StoreClusterVersionKey(), hlc.Timestamp{},
  2667  		&cv, storage.MVCCGetOptions{})
  2668  	return cv, err
  2669  }
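
// roundTripClusterVersion is an illustrative sketch, not part of the original
// file: WriteClusterVersion and ReadClusterVersion operate on the same
// store-local key, so a write followed by a read returns the version that was
// written. An Engine satisfies both the ReadWriter and Reader parameters.
func roundTripClusterVersion(
	ctx context.Context, eng storage.Engine, cv clusterversion.ClusterVersion,
) (clusterversion.ClusterVersion, error) {
	if err := WriteClusterVersion(ctx, eng, cv); err != nil {
		return clusterversion.ClusterVersion{}, err
	}
	return ReadClusterVersion(ctx, eng)
}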
  2670  
  2671  func init() {
  2672  	tracing.RegisterTagRemapping("s", "store")
  2673  }