github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/store_pool.go

     1  // Copyright 2015 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package kvserver
    12  
    13  import (
    14  	"bytes"
    15  	"context"
    16  	"fmt"
    17  	"sort"
    18  	"time"
    19  
    20  	"github.com/cockroachdb/cockroach/pkg/config/zonepb"
    21  	"github.com/cockroachdb/cockroach/pkg/gossip"
    22  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverpb"
    23  	"github.com/cockroachdb/cockroach/pkg/roachpb"
    24  	"github.com/cockroachdb/cockroach/pkg/settings"
    25  	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
    26  	"github.com/cockroachdb/cockroach/pkg/util/hlc"
    27  	"github.com/cockroachdb/cockroach/pkg/util/humanizeutil"
    28  	"github.com/cockroachdb/cockroach/pkg/util/log"
    29  	"github.com/cockroachdb/cockroach/pkg/util/shuffle"
    30  	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
    31  	"github.com/cockroachdb/errors"
    32  )
    33  
    34  const (
    35  	// TestTimeUntilStoreDead is the test value for TimeUntilStoreDead to
    36  	// quickly mark stores as dead.
    37  	TestTimeUntilStoreDead = 5 * time.Millisecond
    38  
    39  	// TestTimeUntilStoreDeadOff is the test value for TimeUntilStoreDead that
    40  	// prevents the store pool from marking stores as dead.
    41  	TestTimeUntilStoreDeadOff = 24 * time.Hour
    42  )
    43  
    44  // DeclinedReservationsTimeout specifies a duration during which the local
    45  // replicate queue will not consider stores which have rejected a reservation a
    46  // viable target.
    47  var DeclinedReservationsTimeout = settings.RegisterNonNegativeDurationSetting(
    48  	"server.declined_reservation_timeout",
    49  	"the amount of time to consider the store throttled for up-replication after a reservation was declined",
    50  	1*time.Second,
    51  )
    52  
    53  // FailedReservationsTimeout specifies a duration during which the local
    54  // replicate queue will not consider stores which have failed a reservation a
    55  // viable target.
    56  var FailedReservationsTimeout = settings.RegisterNonNegativeDurationSetting(
    57  	"server.failed_reservation_timeout",
    58  	"the amount of time to consider the store throttled for up-replication after a failed reservation call",
    59  	5*time.Second,
    60  )
    61  
    62  const timeUntilStoreDeadSettingName = "server.time_until_store_dead"
    63  
    64  // TimeUntilStoreDead wraps "server.time_until_store_dead".
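        //
        // Usage sketch (illustrative; st is assumed to be a *cluster.Settings, as used
        // elsewhere in this file): the effective threshold is read with
        //
        //   dead := TimeUntilStoreDead.Get(&st.SV) // 5m by default
        //
        // and values below gossip.StoresInterval plus a 15s buffer are rejected by the
        // validation function below.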
    65  var TimeUntilStoreDead = func() *settings.DurationSetting {
    66  	s := settings.RegisterValidatedDurationSetting(
    67  		timeUntilStoreDeadSettingName,
    68  		"the time after which if there is no new gossiped information about a store, it is considered dead",
    69  		5*time.Minute,
    70  		func(v time.Duration) error {
    71  			// Setting this to less than the interval for gossiping stores is a big
    72  			// no-no, since this value is compared to the age of the most recent gossip
    73  			// from each store to determine whether that store is live. Put a buffer of
    74  			// 15 seconds on top to allow time for gossip to propagate.
    75  			const minTimeUntilStoreDead = gossip.StoresInterval + 15*time.Second
    76  			if v < minTimeUntilStoreDead {
    77  				return errors.Errorf("cannot set %s to less than %v: %v",
    78  					timeUntilStoreDeadSettingName, minTimeUntilStoreDead, v)
    79  			}
    80  			return nil
    81  		},
    82  	)
    83  	s.SetVisibility(settings.Public)
    84  	return s
    85  }()
    86  
    87  // The NodeCountFunc returns a count of the total number of nodes the user
    88  // intends for there to be in the cluster. The count includes dead nodes, but
    89  // not decommissioned nodes.
    90  type NodeCountFunc func() int
    91  
    92  // A NodeLivenessFunc accepts a node ID and the current time and returns the
    93  // node's liveness status. A node is considered dead if its liveness record
    94  // has expired by more than TimeUntilStoreDead.
    95  type NodeLivenessFunc func(
    96  	nid roachpb.NodeID, now time.Time, timeUntilStoreDead time.Duration,
    97  ) kvserverpb.NodeLivenessStatus
    98  
    99  // MakeStorePoolNodeLivenessFunc returns a function which determines
   100  // the status of a node based on information provided by the specified
   101  // NodeLiveness.
   102  func MakeStorePoolNodeLivenessFunc(nodeLiveness *NodeLiveness) NodeLivenessFunc {
   103  	return func(
   104  		nodeID roachpb.NodeID, now time.Time, timeUntilStoreDead time.Duration,
   105  	) kvserverpb.NodeLivenessStatus {
   106  		liveness, err := nodeLiveness.GetLiveness(nodeID)
   107  		if err != nil {
   108  			return kvserverpb.NodeLivenessStatus_UNAVAILABLE
   109  		}
   110  		return LivenessStatus(liveness, now, timeUntilStoreDead)
   111  	}
   112  }
   113  
   114  // LivenessStatus returns a NodeLivenessStatus enumeration value for the
   115  // provided Liveness based on the provided timestamp and threshold.
   116  //
   117  // See the note on IsLive() for considerations on what should be passed in as
   118  // `now`.
   119  //
   120  // The timeline of the states that a liveness goes through as time passes after
   121  // the respective liveness record is written is the following:
   122  //
   123  //  -----|-------LIVE---|------UNAVAILABLE---|------DEAD------------> time
   124  //       tWrite         tExp                 tExp+threshold
   125  //
   126  // Explanation:
   127  //
   128  //  - Let's say a node writes its liveness record at tWrite. It sets the
   129  //    Expiration field of the record as tExp=tWrite+livenessThreshold.
   130  //    The node is considered LIVE (or DECOMMISSIONING or UNAVAILABLE if draining).
   131  //  - At tExp, the IsLive() method starts returning false. The state becomes
   132  //    UNAVAILABLE (or stays DECOMMISSIONING or UNAVAILABLE if draining).
   133  //  - Once threshold passes, the node is considered DEAD (or DECOMMISSIONED).
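        //
        // Illustrative sketch (l, tExp, and the 5-minute threshold are assumed for the
        // example; l is a record that is neither draining nor decommissioning and whose
        // Expiration corresponds to tExp):
        //
        //   LivenessStatus(l, tExp.Add(-time.Second), 5*time.Minute)  // LIVE
        //   LivenessStatus(l, tExp.Add(time.Second), 5*time.Minute)   // UNAVAILABLE
        //   LivenessStatus(l, tExp.Add(6*time.Minute), 5*time.Minute) // DEAD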
   134  func LivenessStatus(
   135  	l kvserverpb.Liveness, now time.Time, deadThreshold time.Duration,
   136  ) kvserverpb.NodeLivenessStatus {
   137  	if l.IsDead(now, deadThreshold) {
   138  		if l.Decommissioning {
   139  			return kvserverpb.NodeLivenessStatus_DECOMMISSIONED
   140  		}
   141  		return kvserverpb.NodeLivenessStatus_DEAD
   142  	}
   143  	if l.Decommissioning {
   144  		return kvserverpb.NodeLivenessStatus_DECOMMISSIONING
   145  	}
   146  	if l.Draining {
   147  		return kvserverpb.NodeLivenessStatus_UNAVAILABLE
   148  	}
   149  	if l.IsLive(now) {
   150  		return kvserverpb.NodeLivenessStatus_LIVE
   151  	}
   152  	return kvserverpb.NodeLivenessStatus_UNAVAILABLE
   153  }
   154  
   155  type storeDetail struct {
   156  	desc *roachpb.StoreDescriptor
   157  	// throttledUntil is when a throttled store can be considered available again
   158  	// due to a failed or declined snapshot.
   159  	throttledUntil time.Time
   160  	// throttledBecause is set to the most recent reason for which a store was
   161  	// marked as throttled.
   162  	throttledBecause string
   163  	// lastUpdatedTime is set when a store is first consulted and every time
   164  	// gossip arrives for a store.
   165  	lastUpdatedTime time.Time
   166  }
   167  
   168  // isThrottled returns whether the store is currently throttled.
   169  func (sd storeDetail) isThrottled(now time.Time) bool {
   170  	return sd.throttledUntil.After(now)
   171  }
   172  
   173  // storeStatus is the current status of a store.
   174  type storeStatus int
   175  
   176  // These are the possible values for a storeStatus.
   177  const (
   178  	_ storeStatus = iota
   179  	// The store's node is not live or no gossip has been received from
   180  	// the store for more than the timeUntilStoreDead threshold.
   181  	storeStatusDead
   182  	// The store isn't available because it hasn't gossiped yet. This
   183  	// status lasts until either gossip is received from the store or
   184  	// the timeUntilStoreDead threshold has passed, at which point its
   185  	// status will change to dead.
   186  	storeStatusUnknown
   187  	// The store is alive but it is throttled.
   188  	storeStatusThrottled
   189  	// The store is alive and available.
   190  	storeStatusAvailable
   191  	// The store is decommissioning.
   192  	storeStatusDecommissioning
   193  )
   194  
   195  func (sd *storeDetail) status(
   196  	now time.Time, threshold time.Duration, nl NodeLivenessFunc,
   197  ) storeStatus {
   198  	// The store is considered dead if it hasn't been updated via gossip
   199  	// within the liveness threshold. Note that lastUpdatedTime is set
   200  	// when the store detail is created and will have a non-zero value
   201  	// even before the first gossip arrives for a store.
   202  	deadAsOf := sd.lastUpdatedTime.Add(threshold)
   203  	if now.After(deadAsOf) {
   204  		return storeStatusDead
   205  	}
   206  	// If there's no descriptor (meaning no gossip ever arrived for this
   207  	// store), return unavailable.
   208  	if sd.desc == nil {
   209  		return storeStatusUnknown
   210  	}
   211  
   212  	// Even if the store has been updated via gossip, we still rely on
   213  	// the node liveness to determine whether it is considered live.
   214  	switch nl(sd.desc.Node.NodeID, now, threshold) {
   215  	case kvserverpb.NodeLivenessStatus_DEAD, kvserverpb.NodeLivenessStatus_DECOMMISSIONED:
   216  		return storeStatusDead
   217  	case kvserverpb.NodeLivenessStatus_DECOMMISSIONING:
   218  		return storeStatusDecommissioning
   219  	case kvserverpb.NodeLivenessStatus_UNKNOWN, kvserverpb.NodeLivenessStatus_UNAVAILABLE:
   220  		return storeStatusUnknown
   221  	}
   222  
   223  	if sd.isThrottled(now) {
   224  		return storeStatusThrottled
   225  	}
   226  
   227  	return storeStatusAvailable
   228  }
   229  
   230  // localityWithString maintains a string representation of each locality along
   231  // with its protocol buffer implementation. This is for the sake of optimizing
   232  // memory usage by allocating a single copy of each that can be returned to
   233  // callers of getNodeLocalityString rather than each caller (which is currently
   234  // each replica in the local store) making its own copy.
   235  type localityWithString struct {
   236  	locality roachpb.Locality
   237  	str      string
   238  }
   239  
   240  // StorePool maintains a list of all known stores in the cluster and
   241  // information on their health.
   242  type StorePool struct {
   243  	log.AmbientContext
   244  	st *cluster.Settings
   245  
   246  	clock          *hlc.Clock
   247  	gossip         *gossip.Gossip
   248  	nodeCountFn    NodeCountFunc
   249  	nodeLivenessFn NodeLivenessFunc
   250  	startTime      time.Time
   251  	deterministic  bool
   252  	// We use separate mutexes for storeDetails and nodeLocalities because the
   253  	// nodeLocalities map is used in the critical code path of Replica.Send()
   254  	// and we'd rather not block that on something less important accessing
   255  	// storeDetails.
   256  	detailsMu struct {
   257  		syncutil.RWMutex
   258  		storeDetails map[roachpb.StoreID]*storeDetail
   259  	}
   260  	localitiesMu struct {
   261  		syncutil.RWMutex
   262  		nodeLocalities map[roachpb.NodeID]localityWithString
   263  	}
   264  }
   265  
   266  // NewStorePool creates a StorePool and registers the store updating callback
   267  // with gossip.
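        //
        // Construction sketch; the arguments other than the literal false are assumed
        // to exist in the caller's scope:
        //
        //   sp := NewStorePool(
        //       ambientCtx, st, g, clock,
        //       nodeCountFn,
        //       MakeStorePoolNodeLivenessFunc(nodeLiveness),
        //       false, /* deterministic */
        //   )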
   268  func NewStorePool(
   269  	ambient log.AmbientContext,
   270  	st *cluster.Settings,
   271  	g *gossip.Gossip,
   272  	clock *hlc.Clock,
   273  	nodeCountFn NodeCountFunc,
   274  	nodeLivenessFn NodeLivenessFunc,
   275  	deterministic bool,
   276  ) *StorePool {
   277  	sp := &StorePool{
   278  		AmbientContext: ambient,
   279  		st:             st,
   280  		clock:          clock,
   281  		gossip:         g,
   282  		nodeCountFn:    nodeCountFn,
   283  		nodeLivenessFn: nodeLivenessFn,
   284  		startTime:      clock.PhysicalTime(),
   285  		deterministic:  deterministic,
   286  	}
   287  	sp.detailsMu.storeDetails = make(map[roachpb.StoreID]*storeDetail)
   288  	sp.localitiesMu.nodeLocalities = make(map[roachpb.NodeID]localityWithString)
   289  
   290  	// Enable redundant callbacks for the store keys because we use these
   291  	// callbacks as a clock to determine when a store was last updated even if it
   292  	// hasn't otherwise changed.
   293  	storeRegex := gossip.MakePrefixPattern(gossip.KeyStorePrefix)
   294  	g.RegisterCallback(storeRegex, sp.storeGossipUpdate, gossip.Redundant)
   295  
   296  	return sp
   297  }
   298  
   299  func (sp *StorePool) String() string {
   300  	sp.detailsMu.RLock()
   301  	defer sp.detailsMu.RUnlock()
   302  
   303  	ids := make(roachpb.StoreIDSlice, 0, len(sp.detailsMu.storeDetails))
   304  	for id := range sp.detailsMu.storeDetails {
   305  		ids = append(ids, id)
   306  	}
   307  	sort.Sort(ids)
   308  
   309  	var buf bytes.Buffer
   310  	now := sp.clock.Now().GoTime()
   311  	timeUntilStoreDead := TimeUntilStoreDead.Get(&sp.st.SV)
   312  
   313  	for _, id := range ids {
   314  		detail := sp.detailsMu.storeDetails[id]
   315  		fmt.Fprintf(&buf, "%d", id)
   316  		status := detail.status(now, timeUntilStoreDead, sp.nodeLivenessFn)
   317  		if status != storeStatusAvailable {
   318  			fmt.Fprintf(&buf, " (status=%d)", status)
   319  		}
   320  		if detail.desc != nil {
   321  			fmt.Fprintf(&buf, ": range-count=%d fraction-used=%.2f",
   322  				detail.desc.Capacity.RangeCount, detail.desc.Capacity.FractionUsed())
   323  		}
   324  		throttled := detail.throttledUntil.Sub(now)
   325  		if throttled > 0 {
   326  			fmt.Fprintf(&buf, " [throttled=%.1fs]", throttled.Seconds())
   327  		}
   328  		_, _ = buf.WriteString("\n")
   329  	}
   330  	return buf.String()
   331  }
   332  
   333  // storeGossipUpdate is the gossip callback used to keep the StorePool up to date.
   334  func (sp *StorePool) storeGossipUpdate(_ string, content roachpb.Value) {
   335  	var storeDesc roachpb.StoreDescriptor
   336  	if err := content.GetProto(&storeDesc); err != nil {
   337  		ctx := sp.AnnotateCtx(context.TODO())
   338  		log.Errorf(ctx, "%v", err)
   339  		return
   340  	}
   341  
   342  	sp.detailsMu.Lock()
   343  	detail := sp.getStoreDetailLocked(storeDesc.StoreID)
   344  	detail.desc = &storeDesc
   345  	detail.lastUpdatedTime = sp.clock.PhysicalTime()
   346  	sp.detailsMu.Unlock()
   347  
   348  	sp.localitiesMu.Lock()
   349  	sp.localitiesMu.nodeLocalities[storeDesc.Node.NodeID] =
   350  		localityWithString{storeDesc.Node.Locality, storeDesc.Node.Locality.String()}
   351  	sp.localitiesMu.Unlock()
   352  }
   353  
   354  // updateLocalStoreAfterRebalance is used to update the local copy of the
   355  // target store immediately after a replica addition or removal.
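        //
        // Call-site sketch (storeID and usage are assumed to come from the caller's
        // rebalancing decision):
        //
        //   sp.updateLocalStoreAfterRebalance(storeID, usage, roachpb.ADD_REPLICA)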
   356  func (sp *StorePool) updateLocalStoreAfterRebalance(
   357  	storeID roachpb.StoreID, rangeUsageInfo RangeUsageInfo, changeType roachpb.ReplicaChangeType,
   358  ) {
   359  	sp.detailsMu.Lock()
   360  	defer sp.detailsMu.Unlock()
   361  	detail := *sp.getStoreDetailLocked(storeID)
   362  	if detail.desc == nil {
   363  		// We don't have this store yet (this is normal when we're
   364  		// starting up and don't have full information from the gossip
   365  		// network). We can't update the local store at this time.
   366  		return
   367  	}
   368  	switch changeType {
   369  	case roachpb.ADD_REPLICA:
   370  		detail.desc.Capacity.RangeCount++
   371  		detail.desc.Capacity.LogicalBytes += rangeUsageInfo.LogicalBytes
   372  		detail.desc.Capacity.WritesPerSecond += rangeUsageInfo.WritesPerSecond
   373  	case roachpb.REMOVE_REPLICA:
   374  		detail.desc.Capacity.RangeCount--
   375  		if detail.desc.Capacity.LogicalBytes <= rangeUsageInfo.LogicalBytes {
   376  			detail.desc.Capacity.LogicalBytes = 0
   377  		} else {
   378  			detail.desc.Capacity.LogicalBytes -= rangeUsageInfo.LogicalBytes
   379  		}
   380  		if detail.desc.Capacity.WritesPerSecond <= rangeUsageInfo.WritesPerSecond {
   381  			detail.desc.Capacity.WritesPerSecond = 0
   382  		} else {
   383  			detail.desc.Capacity.WritesPerSecond -= rangeUsageInfo.WritesPerSecond
   384  		}
   385  	}
   386  	sp.detailsMu.storeDetails[storeID] = &detail
   387  }
   388  
   389  // updateLocalStoresAfterLeaseTransfer is used to update the local copies of the
   390  // involved store descriptors immediately after a lease transfer.
   391  func (sp *StorePool) updateLocalStoresAfterLeaseTransfer(
   392  	from roachpb.StoreID, to roachpb.StoreID, rangeQPS float64,
   393  ) {
   394  	sp.detailsMu.Lock()
   395  	defer sp.detailsMu.Unlock()
   396  
   397  	fromDetail := *sp.getStoreDetailLocked(from)
   398  	if fromDetail.desc != nil {
   399  		fromDetail.desc.Capacity.LeaseCount--
   400  		if fromDetail.desc.Capacity.QueriesPerSecond < rangeQPS {
   401  			fromDetail.desc.Capacity.QueriesPerSecond = 0
   402  		} else {
   403  			fromDetail.desc.Capacity.QueriesPerSecond -= rangeQPS
   404  		}
   405  		sp.detailsMu.storeDetails[from] = &fromDetail
   406  	}
   407  
   408  	toDetail := *sp.getStoreDetailLocked(to)
   409  	if toDetail.desc != nil {
   410  		toDetail.desc.Capacity.LeaseCount++
   411  		toDetail.desc.Capacity.QueriesPerSecond += rangeQPS
   412  		sp.detailsMu.storeDetails[to] = &toDetail
   413  	}
   414  }
   415  
   416  // newStoreDetail makes a new, empty storeDetail struct. Callers are expected
   417  // to set lastUpdatedTime themselves; desc is populated once gossip arrives.
   418  func newStoreDetail() *storeDetail {
   419  	return &storeDetail{}
   420  }
   421  
   422  // GetStores returns information on all the stores with descriptor in the pool.
   423  // Stores without descriptor (a node that didn't come up yet after a cluster
   424  // restart) will not be part of the returned set.
   425  func (sp *StorePool) GetStores() map[roachpb.StoreID]roachpb.StoreDescriptor {
   426  	sp.detailsMu.RLock()
   427  	defer sp.detailsMu.RUnlock()
   428  	stores := make(map[roachpb.StoreID]roachpb.StoreDescriptor, len(sp.detailsMu.storeDetails))
   429  	for _, s := range sp.detailsMu.storeDetails {
   430  		if s.desc != nil {
   431  			stores[s.desc.StoreID] = *s.desc
   432  		}
   433  	}
   434  	return stores
   435  }
   436  
   437  // getStoreDetailLocked returns the store detail for the given storeID.
   438  // The lock must be held *in write mode* even though this looks like a
   439  // read-only method.
   440  func (sp *StorePool) getStoreDetailLocked(storeID roachpb.StoreID) *storeDetail {
   441  	detail, ok := sp.detailsMu.storeDetails[storeID]
   442  	if !ok {
   443  		// We don't have this store yet (this is normal when we're
   444  		// starting up and don't have full information from the gossip
   445  		// network). The first time this occurs, presume the store is
   446  		// alive, but start the clock so it will become dead if enough
   447  		// time passes without updates from gossip.
   448  		detail = newStoreDetail()
   449  		detail.lastUpdatedTime = sp.startTime
   450  		sp.detailsMu.storeDetails[storeID] = detail
   451  	}
   452  	return detail
   453  }
   454  
   455  // getStoreDescriptor returns the latest store descriptor for the given
   456  // storeID.
   457  func (sp *StorePool) getStoreDescriptor(storeID roachpb.StoreID) (roachpb.StoreDescriptor, bool) {
   458  	sp.detailsMu.RLock()
   459  	defer sp.detailsMu.RUnlock()
   460  
   461  	if detail, ok := sp.detailsMu.storeDetails[storeID]; ok && detail.desc != nil {
   462  		return *detail.desc, true
   463  	}
   464  	return roachpb.StoreDescriptor{}, false
   465  }
   466  
   467  // decommissioningReplicas filters out replicas that are on decommissioning
   468  // nodes or stores from the provided repls and returns them in a slice.
   469  func (sp *StorePool) decommissioningReplicas(
   470  	repls []roachpb.ReplicaDescriptor,
   471  ) (decommissioningReplicas []roachpb.ReplicaDescriptor) {
   472  	sp.detailsMu.Lock()
   473  	defer sp.detailsMu.Unlock()
   474  
   475  // NB: We use clock.Now().GoTime() instead of clock.PhysicalTime() in order to
   476  	// take clock signals from remote nodes into consideration.
   477  	now := sp.clock.Now().GoTime()
   478  	timeUntilStoreDead := TimeUntilStoreDead.Get(&sp.st.SV)
   479  
   480  	for _, repl := range repls {
   481  		detail := sp.getStoreDetailLocked(repl.StoreID)
   482  		switch detail.status(now, timeUntilStoreDead, sp.nodeLivenessFn) {
   483  		case storeStatusDecommissioning:
   484  			decommissioningReplicas = append(decommissioningReplicas, repl)
   485  		}
   486  	}
   487  	return
   488  }
   489  
   490  // ClusterNodeCount returns the number of nodes that are possible allocation
   491  // targets. This includes dead nodes, but not decommissioning or decommissioned
   492  // nodes.
   493  func (sp *StorePool) ClusterNodeCount() int {
   494  	return sp.nodeCountFn()
   495  }
   496  
   497  // liveAndDeadReplicas divides the provided repls slice into two slices: the
   498  // first for live replicas, and the second for dead replicas.
   499  // Replicas for which liveness or deadness cannot be ascertained are excluded
   500  // from the returned slices. Replicas on decommissioning nodes or stores are
   501  // considered live.
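        //
        // Usage sketch (repls is assumed to be the replica set of some range
        // descriptor):
        //
        //   liveReplicas, deadReplicas := sp.liveAndDeadReplicas(repls)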
   502  func (sp *StorePool) liveAndDeadReplicas(
   503  	repls []roachpb.ReplicaDescriptor,
   504  ) (liveReplicas, deadReplicas []roachpb.ReplicaDescriptor) {
   505  	sp.detailsMu.Lock()
   506  	defer sp.detailsMu.Unlock()
   507  
   508  	now := sp.clock.Now().GoTime()
   509  	timeUntilStoreDead := TimeUntilStoreDead.Get(&sp.st.SV)
   510  
   511  	for _, repl := range repls {
   512  		detail := sp.getStoreDetailLocked(repl.StoreID)
   513  		// Mark replica as dead if store is dead.
   514  		status := detail.status(now, timeUntilStoreDead, sp.nodeLivenessFn)
   515  		switch status {
   516  		case storeStatusDead:
   517  			deadReplicas = append(deadReplicas, repl)
   518  		case storeStatusAvailable, storeStatusThrottled, storeStatusDecommissioning:
   519  			// We count both available and throttled stores to be live for the
   520  			// purpose of computing quorum.
   521  			// We count decommissioning replicas to be alive because they are readable
   522  			// and should be used for up-replication if necessary.
   523  			liveReplicas = append(liveReplicas, repl)
   524  		case storeStatusUnknown:
   525  		// No-op.
   526  		default:
   527  			log.Fatalf(context.TODO(), "unknown store status %d", status)
   528  		}
   529  	}
   530  	return
   531  }
   532  
   533  // stat provides a running sample size and running stats.
   534  type stat struct {
   535  	n, mean float64
   536  }
   537  
   538  // update adds the specified value to the stat, augmenting the running stats.
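        //
        // Worked example (values chosen purely for illustration): starting from the
        // zero value, update(2), update(4), update(6) moves the mean 0 -> 2 -> 3 -> 4,
        // since each call adds (x-mean)/n; the final mean 4 is the average of 2, 4, 6.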
   539  func (s *stat) update(x float64) {
   540  	s.n++
   541  	s.mean += (x - s.mean) / s.n
   542  }
   543  
   544  // StoreList holds a list of store descriptors and associated count and used
   545  // stats for those stores.
   546  type StoreList struct {
   547  	stores []roachpb.StoreDescriptor
   548  
   549  	// candidateRanges tracks range count stats for stores that are eligible to
   550  	// be rebalance targets (their used capacity percentage must be lower than
   551  	// maxFractionUsedThreshold).
   552  	candidateRanges stat
   553  
   554  	// candidateLeases tracks range lease stats for stores that are eligible to
   555  	// be rebalance targets.
   556  	candidateLeases stat
   557  
   558  	// candidateLogicalBytes tracks disk usage stats for stores that are eligible
   559  	// to be rebalance targets.
   560  	candidateLogicalBytes stat
   561  
   562  	// candidateQueriesPerSecond tracks queries-per-second stats for stores that
   563  	// are eligible to be rebalance targets.
   564  	candidateQueriesPerSecond stat
   565  
   566  	// candidateWritesPerSecond tracks writes-per-second stats for stores that are
   567  	// eligible to be rebalance targets.
   568  	candidateWritesPerSecond stat
   569  }
   570  
   571  // makeStoreList generates a new store list based on the passed-in descriptors.
   572  // It maintains the order of those descriptors.
   573  func makeStoreList(descriptors []roachpb.StoreDescriptor) StoreList {
   574  	sl := StoreList{stores: descriptors}
   575  	for _, desc := range descriptors {
   576  		if maxCapacityCheck(desc) {
   577  			sl.candidateRanges.update(float64(desc.Capacity.RangeCount))
   578  		}
   579  		sl.candidateLeases.update(float64(desc.Capacity.LeaseCount))
   580  		sl.candidateLogicalBytes.update(float64(desc.Capacity.LogicalBytes))
   581  		sl.candidateQueriesPerSecond.update(desc.Capacity.QueriesPerSecond)
   582  		sl.candidateWritesPerSecond.update(desc.Capacity.WritesPerSecond)
   583  	}
   584  	return sl
   585  }
   586  
   587  func (sl StoreList) String() string {
   588  	var buf bytes.Buffer
   589  	fmt.Fprintf(&buf,
   590  		"  candidate: avg-ranges=%v avg-leases=%v avg-disk-usage=%v avg-queries-per-second=%v",
   591  		sl.candidateRanges.mean,
   592  		sl.candidateLeases.mean,
   593  		humanizeutil.IBytes(int64(sl.candidateLogicalBytes.mean)),
   594  		sl.candidateQueriesPerSecond.mean)
   595  	if len(sl.stores) > 0 {
   596  		fmt.Fprintf(&buf, "\n")
   597  	} else {
   598  		fmt.Fprintf(&buf, " <no candidates>")
   599  	}
   600  	for _, desc := range sl.stores {
   601  		fmt.Fprintf(&buf, "  %d: ranges=%d leases=%d disk-usage=%s queries-per-second=%.2f\n",
   602  			desc.StoreID, desc.Capacity.RangeCount,
   603  			desc.Capacity.LeaseCount, humanizeutil.IBytes(desc.Capacity.LogicalBytes),
   604  			desc.Capacity.QueriesPerSecond)
   605  	}
   606  	return buf.String()
   607  }
   608  
   609  // filter takes a store list and filters it using the passed in constraints. It
   610  // maintains the original order of the passed in store list.
   611  func (sl StoreList) filter(constraints []zonepb.ConstraintsConjunction) StoreList {
   612  	if len(constraints) == 0 {
   613  		return sl
   614  	}
   615  	var filteredDescs []roachpb.StoreDescriptor
   616  	for _, store := range sl.stores {
   617  		if ok := constraintsCheck(store, constraints); ok {
   618  			filteredDescs = append(filteredDescs, store)
   619  		}
   620  	}
   621  	return makeStoreList(filteredDescs)
   622  }
   623  
   624  type storeFilter int
   625  
   626  const (
   627  	_ storeFilter = iota
   628  	// storeFilterNone requests that the storeList include all live stores. Dead,
   629  	// unknown, and decommissioning stores are always excluded from the storeList.
   630  	storeFilterNone
   631  	// storeFilterThrottled requests that the returned store list additionally
   632  	// exclude stores that have been throttled for declining a snapshot. (See
   633  	// storePool.throttle for details.) Throttled stores should not be considered
   634  	// for replica rebalancing, for example, but can still be considered for lease
   635  	// rebalancing.
   636  	storeFilterThrottled
   637  )
   638  
   639  type throttledStoreReasons []string
   640  
   641  // getStoreList returns a storeList of all the active stores in the pool along
   642  // with their associated stats. The storeList is filtered according to the
   643  // provided storeFilter. It also returns the total number of alive stores and
   644  // the reasons for which stores are currently throttled.
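        //
        // Usage sketch (illustrative only):
        //
        //   sl, aliveCount, throttledReasons := sp.getStoreList(storeFilterThrottled)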
   645  func (sp *StorePool) getStoreList(filter storeFilter) (StoreList, int, throttledStoreReasons) {
   646  	sp.detailsMu.RLock()
   647  	defer sp.detailsMu.RUnlock()
   648  
   649  	var storeIDs roachpb.StoreIDSlice
   650  	for storeID := range sp.detailsMu.storeDetails {
   651  		storeIDs = append(storeIDs, storeID)
   652  	}
   653  	return sp.getStoreListFromIDsRLocked(storeIDs, filter)
   654  }
   655  
   656  // getStoreListFromIDs is the same function as getStoreList but only returns stores
   657  // from the subset of passed in store IDs.
   658  func (sp *StorePool) getStoreListFromIDs(
   659  	storeIDs roachpb.StoreIDSlice, filter storeFilter,
   660  ) (StoreList, int, throttledStoreReasons) {
   661  	sp.detailsMu.RLock()
   662  	defer sp.detailsMu.RUnlock()
   663  	return sp.getStoreListFromIDsRLocked(storeIDs, filter)
   664  }
   665  
   666  // getStoreListFromIDsRLocked is the same function as getStoreList but requires
   667  // that the detailsMu read lock is held.
   668  func (sp *StorePool) getStoreListFromIDsRLocked(
   669  	storeIDs roachpb.StoreIDSlice, filter storeFilter,
   670  ) (StoreList, int, throttledStoreReasons) {
   671  	if sp.deterministic {
   672  		sort.Sort(storeIDs)
   673  	} else {
   674  		shuffle.Shuffle(storeIDs)
   675  	}
   676  
   677  	var aliveStoreCount int
   678  	var throttled throttledStoreReasons
   679  	var storeDescriptors []roachpb.StoreDescriptor
   680  
   681  	now := sp.clock.Now().GoTime()
   682  	timeUntilStoreDead := TimeUntilStoreDead.Get(&sp.st.SV)
   683  
   684  	for _, storeID := range storeIDs {
   685  		detail, ok := sp.detailsMu.storeDetails[storeID]
   686  		if !ok {
   687  			// Do nothing; this store is not in the StorePool.
   688  			continue
   689  		}
   690  		switch s := detail.status(now, timeUntilStoreDead, sp.nodeLivenessFn); s {
   691  		case storeStatusThrottled:
   692  			aliveStoreCount++
   693  			throttled = append(throttled, detail.throttledBecause)
   694  			if filter != storeFilterThrottled {
   695  				storeDescriptors = append(storeDescriptors, *detail.desc)
   696  			}
   697  		case storeStatusAvailable:
   698  			aliveStoreCount++
   699  			storeDescriptors = append(storeDescriptors, *detail.desc)
   700  		case storeStatusDead, storeStatusUnknown, storeStatusDecommissioning:
   701  			// Do nothing; this store cannot be used.
   702  		default:
   703  			panic(fmt.Sprintf("unknown store status: %d", s))
   704  		}
   705  	}
   706  	return makeStoreList(storeDescriptors), aliveStoreCount, throttled
   707  }
   708  
   709  type throttleReason int
   710  
   711  const (
   712  	_ throttleReason = iota
   713  	throttleDeclined
   714  	throttleFailed
   715  )
   716  
   717  // throttle informs the store pool that the given remote store declined a
   718  // snapshot or failed to apply one, ensuring that it will not be considered
   719  // for up-replication or rebalancing until after the configured timeout period
   720  // has elapsed. A reason of throttleDeclined indicates that the remote store
   721  // explicitly declined a snapshot.
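        //
        // Call-site sketch (the reason string is free-form and assumed here):
        //
        //   sp.throttle(throttleFailed, "snapshot application failed", storeID)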
   722  func (sp *StorePool) throttle(reason throttleReason, why string, storeID roachpb.StoreID) {
   723  	sp.detailsMu.Lock()
   724  	defer sp.detailsMu.Unlock()
   725  	detail := sp.getStoreDetailLocked(storeID)
   726  	detail.throttledBecause = why
   727  
   728  	// If a snapshot is declined, be it due to an error or because it was
   729  	// rejected, we mark the store detail as having been declined so it won't
   730  	// be considered as a candidate for new replicas until after the configured
   731  	// timeout period has passed.
   732  	switch reason {
   733  	case throttleDeclined:
   734  		timeout := DeclinedReservationsTimeout.Get(&sp.st.SV)
   735  		detail.throttledUntil = sp.clock.PhysicalTime().Add(timeout)
   736  		if log.V(2) {
   737  			ctx := sp.AnnotateCtx(context.TODO())
   738  			log.Infof(ctx, "snapshot declined (%s), s%d will be throttled for %s until %s",
   739  				why, storeID, timeout, detail.throttledUntil)
   740  		}
   741  	case throttleFailed:
   742  		timeout := FailedReservationsTimeout.Get(&sp.st.SV)
   743  		detail.throttledUntil = sp.clock.PhysicalTime().Add(timeout)
   744  		if log.V(2) {
   745  			ctx := sp.AnnotateCtx(context.TODO())
   746  			log.Infof(ctx, "snapshot failed (%s), s%d will be throttled for %s until %s",
   747  				why, storeID, timeout, detail.throttledUntil)
   748  		}
   749  	}
   750  }
   751  
   752  // getLocalities returns the localities for the provided replicas.
   753  // TODO(bram): consider storing a full list of all node to node diversity
   754  // scores for faster lookups.
   755  func (sp *StorePool) getLocalities(
   756  	replicas []roachpb.ReplicaDescriptor,
   757  ) map[roachpb.NodeID]roachpb.Locality {
   758  	sp.localitiesMu.RLock()
   759  	defer sp.localitiesMu.RUnlock()
   760  	localities := make(map[roachpb.NodeID]roachpb.Locality)
   761  	for _, replica := range replicas {
   762  		if locality, ok := sp.localitiesMu.nodeLocalities[replica.NodeID]; ok {
   763  			localities[replica.NodeID] = locality.locality
   764  		} else {
   765  			localities[replica.NodeID] = roachpb.Locality{}
   766  		}
   767  	}
   768  	return localities
   769  }
   770  
   771  // getNodeLocalityString returns the locality information for the given node
   772  // in its string format.
   773  func (sp *StorePool) getNodeLocalityString(nodeID roachpb.NodeID) string {
   774  	sp.localitiesMu.RLock()
   775  	defer sp.localitiesMu.RUnlock()
   776  	locality, ok := sp.localitiesMu.nodeLocalities[nodeID]
   777  	if !ok {
   778  		return ""
   779  	}
   780  	return locality.str
   781  }