github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/store_pool.go

// Copyright 2015 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package kvserver

import (
	"bytes"
	"context"
	"fmt"
	"sort"
	"time"

	"github.com/cockroachdb/cockroach/pkg/config/zonepb"
	"github.com/cockroachdb/cockroach/pkg/gossip"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverpb"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/settings"
	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
	"github.com/cockroachdb/cockroach/pkg/util/hlc"
	"github.com/cockroachdb/cockroach/pkg/util/humanizeutil"
	"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/cockroachdb/cockroach/pkg/util/shuffle"
	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
	"github.com/cockroachdb/errors"
)

const (
	// TestTimeUntilStoreDead is the test value for TimeUntilStoreDead to
	// quickly mark stores as dead.
	TestTimeUntilStoreDead = 5 * time.Millisecond

	// TestTimeUntilStoreDeadOff is the test value for TimeUntilStoreDead that
	// prevents the store pool from marking stores as dead.
	TestTimeUntilStoreDeadOff = 24 * time.Hour
)

// DeclinedReservationsTimeout specifies a duration during which the local
// replicate queue will not consider stores which have rejected a reservation a
// viable target.
var DeclinedReservationsTimeout = settings.RegisterNonNegativeDurationSetting(
	"server.declined_reservation_timeout",
	"the amount of time to consider the store throttled for up-replication after a reservation was declined",
	1*time.Second,
)

// FailedReservationsTimeout specifies a duration during which the local
// replicate queue will not consider stores which have failed a reservation a
// viable target.
var FailedReservationsTimeout = settings.RegisterNonNegativeDurationSetting(
	"server.failed_reservation_timeout",
	"the amount of time to consider the store throttled for up-replication after a failed reservation call",
	5*time.Second,
)

const timeUntilStoreDeadSettingName = "server.time_until_store_dead"

// TimeUntilStoreDead wraps "server.time_until_store_dead".
var TimeUntilStoreDead = func() *settings.DurationSetting {
	s := settings.RegisterValidatedDurationSetting(
		timeUntilStoreDeadSettingName,
		"the time after which if there is no new gossiped information about a store, it is considered dead",
		5*time.Minute,
		func(v time.Duration) error {
			// Setting this to less than the interval for gossiping stores is a big
			// no-no, since this value is compared to the age of the most recent gossip
			// from each store to determine whether that store is live. Put a buffer of
			// 15 seconds on top to allow time for gossip to propagate.
			const minTimeUntilStoreDead = gossip.StoresInterval + 15*time.Second
			if v < minTimeUntilStoreDead {
				return errors.Errorf("cannot set %s to less than %v: %v",
					timeUntilStoreDeadSettingName, minTimeUntilStoreDead, v)
			}
			return nil
		},
	)
	s.SetVisibility(settings.Public)
	return s
}()
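// Illustrative sketch (hypothetical test code, not taken from this file): the
// testing constants above are intended to be applied through the setting's
// Override helper, which is assumed here to bypass the validation attached to
// the setting:
//
//   st := cluster.MakeTestingClusterSettings()
//   TimeUntilStoreDead.Override(&st.SV, TestTimeUntilStoreDead)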
// The NodeCountFunc returns a count of the total number of nodes the user
// intends for there to be in the cluster. The count includes dead nodes, but
// not decommissioned nodes.
type NodeCountFunc func() int

// A NodeLivenessFunc accepts a node ID and current time and returns whether or
// not the node is live. A node is considered dead if its liveness record has
// expired by more than TimeUntilStoreDead.
type NodeLivenessFunc func(
	nid roachpb.NodeID, now time.Time, timeUntilStoreDead time.Duration,
) kvserverpb.NodeLivenessStatus

// MakeStorePoolNodeLivenessFunc returns a function which determines
// the status of a node based on information provided by the specified
// NodeLiveness.
func MakeStorePoolNodeLivenessFunc(nodeLiveness *NodeLiveness) NodeLivenessFunc {
	return func(
		nodeID roachpb.NodeID, now time.Time, timeUntilStoreDead time.Duration,
	) kvserverpb.NodeLivenessStatus {
		liveness, err := nodeLiveness.GetLiveness(nodeID)
		if err != nil {
			return kvserverpb.NodeLivenessStatus_UNAVAILABLE
		}
		return LivenessStatus(liveness, now, timeUntilStoreDead)
	}
}

// LivenessStatus returns a NodeLivenessStatus enumeration value for the
// provided Liveness based on the provided timestamp and threshold.
//
// See the note on IsLive() for considerations on what should be passed in as
// `now`.
//
// The timeline of the states that a liveness goes through as time passes after
// the respective liveness record is written is the following:
//
//  -----|-------LIVE---|------UNAVAILABLE---|------DEAD------------> time
//       tWrite         tExp                 tExp+threshold
//
// Explanation:
//
//  - Let's say a node writes its liveness record at tWrite. It sets the
//    Expiration field of the record as tExp=tWrite+livenessThreshold.
//    The node is considered LIVE (or DECOMMISSIONING or UNAVAILABLE if draining).
//  - At tExp, the IsLive() method starts returning false. The state becomes
//    UNAVAILABLE (or stays DECOMMISSIONING or UNAVAILABLE if draining).
//  - Once threshold passes, the node is considered DEAD (or DECOMMISSIONED).
func LivenessStatus(
	l kvserverpb.Liveness, now time.Time, deadThreshold time.Duration,
) kvserverpb.NodeLivenessStatus {
	if l.IsDead(now, deadThreshold) {
		if l.Decommissioning {
			return kvserverpb.NodeLivenessStatus_DECOMMISSIONED
		}
		return kvserverpb.NodeLivenessStatus_DEAD
	}
	if l.Decommissioning {
		return kvserverpb.NodeLivenessStatus_DECOMMISSIONING
	}
	if l.Draining {
		return kvserverpb.NodeLivenessStatus_UNAVAILABLE
	}
	if l.IsLive(now) {
		return kvserverpb.NodeLivenessStatus_LIVE
	}
	return kvserverpb.NodeLivenessStatus_UNAVAILABLE
}
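// Worked example (illustrative numbers, not taken from the code above): suppose
// a node writes its liveness record at tWrite=100s with a liveness threshold of
// 9s, so tExp=109s, and deadThreshold is the default 5m (300s):
//
//   LivenessStatus(l, now=105s, 300s) -> LIVE         (now before tExp)
//   LivenessStatus(l, now=150s, 300s) -> UNAVAILABLE  (tExp passed, not yet dead)
//   LivenessStatus(l, now=420s, 300s) -> DEAD         (now past tExp+300s)
//
// With l.Decommissioning set, the same timeline yields DECOMMISSIONING and,
// once dead, DECOMMISSIONED.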
type storeDetail struct {
	desc *roachpb.StoreDescriptor
	// throttledUntil is when a throttled store can be considered available again
	// due to a failed or declined snapshot.
	throttledUntil time.Time
	// throttledBecause is set to the most recent reason for which a store was
	// marked as throttled.
	throttledBecause string
	// lastUpdatedTime is set when a store is first consulted and every time
	// gossip arrives for a store.
	lastUpdatedTime time.Time
}

// isThrottled returns whether the store is currently throttled.
func (sd storeDetail) isThrottled(now time.Time) bool {
	return sd.throttledUntil.After(now)
}

// storeStatus is the current status of a store.
type storeStatus int

// These are the possible values for a storeStatus.
const (
	_ storeStatus = iota
	// The store's node is not live or no gossip has been received from
	// the store for more than the timeUntilStoreDead threshold.
	storeStatusDead
	// The store isn't available because it hasn't gossiped yet. This
	// status lasts until either gossip is received from the store or
	// the timeUntilStoreDead threshold has passed, at which point its
	// status will change to dead.
	storeStatusUnknown
	// The store is alive but it is throttled.
	storeStatusThrottled
	// The store is alive and available.
	storeStatusAvailable
	// The store is decommissioning.
	storeStatusDecommissioning
)

func (sd *storeDetail) status(
	now time.Time, threshold time.Duration, nl NodeLivenessFunc,
) storeStatus {
	// The store is considered dead if it hasn't been updated via gossip
	// within the liveness threshold. Note that lastUpdatedTime is set
	// when the store detail is created and will have a non-zero value
	// even before the first gossip arrives for a store.
	deadAsOf := sd.lastUpdatedTime.Add(threshold)
	if now.After(deadAsOf) {
		return storeStatusDead
	}
	// If there's no descriptor (meaning no gossip ever arrived for this
	// store), return unknown.
	if sd.desc == nil {
		return storeStatusUnknown
	}

	// Even if the store has been updated via gossip, we still rely on
	// the node liveness to determine whether it is considered live.
	switch nl(sd.desc.Node.NodeID, now, threshold) {
	case kvserverpb.NodeLivenessStatus_DEAD, kvserverpb.NodeLivenessStatus_DECOMMISSIONED:
		return storeStatusDead
	case kvserverpb.NodeLivenessStatus_DECOMMISSIONING:
		return storeStatusDecommissioning
	case kvserverpb.NodeLivenessStatus_UNKNOWN, kvserverpb.NodeLivenessStatus_UNAVAILABLE:
		return storeStatusUnknown
	}

	if sd.isThrottled(now) {
		return storeStatusThrottled
	}

	return storeStatusAvailable
}
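// Illustrative example (assumed numbers): with threshold=5m, a store whose
// lastUpdatedTime is 6 minutes old is storeStatusDead before node liveness is
// even consulted; a recently gossiped store whose node liveness reports
// DECOMMISSIONING comes back as storeStatusDecommissioning; only a live,
// un-throttled store with a descriptor ends up as storeStatusAvailable.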
// localityWithString maintains a string representation of each locality along
// with its protocol buffer implementation. This is for the sake of optimizing
// memory usage by allocating a single copy of each that can be returned to
// callers of getNodeLocalityString rather than each caller (which is currently
// each replica in the local store) making its own copy.
type localityWithString struct {
	locality roachpb.Locality
	str      string
}

// StorePool maintains a list of all known stores in the cluster and
// information on their health.
type StorePool struct {
	log.AmbientContext
	st *cluster.Settings

	clock          *hlc.Clock
	gossip         *gossip.Gossip
	nodeCountFn    NodeCountFunc
	nodeLivenessFn NodeLivenessFunc
	startTime      time.Time
	deterministic  bool
	// We use separate mutexes for storeDetails and nodeLocalities because the
	// nodeLocalities map is used in the critical code path of Replica.Send()
	// and we'd rather not block that on something less important accessing
	// storeDetails.
	detailsMu struct {
		syncutil.RWMutex
		storeDetails map[roachpb.StoreID]*storeDetail
	}
	localitiesMu struct {
		syncutil.RWMutex
		nodeLocalities map[roachpb.NodeID]localityWithString
	}
}

// NewStorePool creates a StorePool and registers the store updating callback
// with gossip.
func NewStorePool(
	ambient log.AmbientContext,
	st *cluster.Settings,
	g *gossip.Gossip,
	clock *hlc.Clock,
	nodeCountFn NodeCountFunc,
	nodeLivenessFn NodeLivenessFunc,
	deterministic bool,
) *StorePool {
	sp := &StorePool{
		AmbientContext: ambient,
		st:             st,
		clock:          clock,
		gossip:         g,
		nodeCountFn:    nodeCountFn,
		nodeLivenessFn: nodeLivenessFn,
		startTime:      clock.PhysicalTime(),
		deterministic:  deterministic,
	}
	sp.detailsMu.storeDetails = make(map[roachpb.StoreID]*storeDetail)
	sp.localitiesMu.nodeLocalities = make(map[roachpb.NodeID]localityWithString)

	// Enable redundant callbacks for the store keys because we use these
	// callbacks as a clock to determine when a store was last updated even if it
	// hasn't otherwise changed.
	storeRegex := gossip.MakePrefixPattern(gossip.KeyStorePrefix)
	g.RegisterCallback(storeRegex, sp.storeGossipUpdate, gossip.Redundant)

	return sp
}

func (sp *StorePool) String() string {
	sp.detailsMu.RLock()
	defer sp.detailsMu.RUnlock()

	ids := make(roachpb.StoreIDSlice, 0, len(sp.detailsMu.storeDetails))
	for id := range sp.detailsMu.storeDetails {
		ids = append(ids, id)
	}
	sort.Sort(ids)

	var buf bytes.Buffer
	now := sp.clock.Now().GoTime()
	timeUntilStoreDead := TimeUntilStoreDead.Get(&sp.st.SV)

	for _, id := range ids {
		detail := sp.detailsMu.storeDetails[id]
		fmt.Fprintf(&buf, "%d", id)
		status := detail.status(now, timeUntilStoreDead, sp.nodeLivenessFn)
		if status != storeStatusAvailable {
			fmt.Fprintf(&buf, " (status=%d)", status)
		}
		if detail.desc != nil {
			fmt.Fprintf(&buf, ": range-count=%d fraction-used=%.2f",
				detail.desc.Capacity.RangeCount, detail.desc.Capacity.FractionUsed())
		}
		throttled := detail.throttledUntil.Sub(now)
		if throttled > 0 {
			fmt.Fprintf(&buf, " [throttled=%.1fs]", throttled.Seconds())
		}
		_, _ = buf.WriteString("\n")
	}
	return buf.String()
}
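// Illustrative output (made-up values): for a pool with an available store s1,
// a throttled store s2, and a store s3 that has not gossiped yet, String()
// would produce something along the lines of
//
//   1: range-count=42 fraction-used=0.30
//   2 (status=3): range-count=17 fraction-used=0.11 [throttled=3.5s]
//   3 (status=2)
//
// where the status numbers correspond to the storeStatus constants above and a
// store without a descriptor prints no capacity details.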
// storeGossipUpdate is the gossip callback used to keep the StorePool up to date.
func (sp *StorePool) storeGossipUpdate(_ string, content roachpb.Value) {
	var storeDesc roachpb.StoreDescriptor
	if err := content.GetProto(&storeDesc); err != nil {
		ctx := sp.AnnotateCtx(context.TODO())
		log.Errorf(ctx, "%v", err)
		return
	}

	sp.detailsMu.Lock()
	detail := sp.getStoreDetailLocked(storeDesc.StoreID)
	detail.desc = &storeDesc
	detail.lastUpdatedTime = sp.clock.PhysicalTime()
	sp.detailsMu.Unlock()

	sp.localitiesMu.Lock()
	sp.localitiesMu.nodeLocalities[storeDesc.Node.NodeID] =
		localityWithString{storeDesc.Node.Locality, storeDesc.Node.Locality.String()}
	sp.localitiesMu.Unlock()
}

// updateLocalStoreAfterRebalance is used to update the local copy of the
// target store immediately after a replica addition or removal.
func (sp *StorePool) updateLocalStoreAfterRebalance(
	storeID roachpb.StoreID, rangeUsageInfo RangeUsageInfo, changeType roachpb.ReplicaChangeType,
) {
	sp.detailsMu.Lock()
	defer sp.detailsMu.Unlock()
	detail := *sp.getStoreDetailLocked(storeID)
	if detail.desc == nil {
		// We don't have this store yet (this is normal when we're
		// starting up and don't have full information from the gossip
		// network). We can't update the local store at this time.
		return
	}
	switch changeType {
	case roachpb.ADD_REPLICA:
		detail.desc.Capacity.RangeCount++
		detail.desc.Capacity.LogicalBytes += rangeUsageInfo.LogicalBytes
		detail.desc.Capacity.WritesPerSecond += rangeUsageInfo.WritesPerSecond
	case roachpb.REMOVE_REPLICA:
		detail.desc.Capacity.RangeCount--
		if detail.desc.Capacity.LogicalBytes <= rangeUsageInfo.LogicalBytes {
			detail.desc.Capacity.LogicalBytes = 0
		} else {
			detail.desc.Capacity.LogicalBytes -= rangeUsageInfo.LogicalBytes
		}
		if detail.desc.Capacity.WritesPerSecond <= rangeUsageInfo.WritesPerSecond {
			detail.desc.Capacity.WritesPerSecond = 0
		} else {
			detail.desc.Capacity.WritesPerSecond -= rangeUsageInfo.WritesPerSecond
		}
	}
	sp.detailsMu.storeDetails[storeID] = &detail
}

// updateLocalStoresAfterLeaseTransfer is used to update the local copies of the
// involved store descriptors immediately after a lease transfer.
func (sp *StorePool) updateLocalStoresAfterLeaseTransfer(
	from roachpb.StoreID, to roachpb.StoreID, rangeQPS float64,
) {
	sp.detailsMu.Lock()
	defer sp.detailsMu.Unlock()

	fromDetail := *sp.getStoreDetailLocked(from)
	if fromDetail.desc != nil {
		fromDetail.desc.Capacity.LeaseCount--
		if fromDetail.desc.Capacity.QueriesPerSecond < rangeQPS {
			fromDetail.desc.Capacity.QueriesPerSecond = 0
		} else {
			fromDetail.desc.Capacity.QueriesPerSecond -= rangeQPS
		}
		sp.detailsMu.storeDetails[from] = &fromDetail
	}

	toDetail := *sp.getStoreDetailLocked(to)
	if toDetail.desc != nil {
		toDetail.desc.Capacity.LeaseCount++
		toDetail.desc.Capacity.QueriesPerSecond += rangeQPS
		sp.detailsMu.storeDetails[to] = &toDetail
	}
}
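// Illustrative usage (hypothetical caller, not taken from this package): after
// applying a rebalance and a lease transfer, the calling code can keep the
// cached descriptors roughly current instead of waiting for the next gossip:
//
//   sp.updateLocalStoreAfterRebalance(targetStoreID, usage, roachpb.ADD_REPLICA)
//   sp.updateLocalStoreAfterRebalance(sourceStoreID, usage, roachpb.REMOVE_REPLICA)
//   sp.updateLocalStoresAfterLeaseTransfer(fromStoreID, toStoreID, rangeQPS)
//
// where usage is the RangeUsageInfo of the affected range and the store IDs and
// rangeQPS are placeholders.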
// newStoreDetail makes a new storeDetail struct.
func newStoreDetail() *storeDetail {
	return &storeDetail{}
}

// GetStores returns information on all the stores with descriptor in the pool.
// Stores without descriptor (a node that didn't come up yet after a cluster
// restart) will not be part of the returned set.
func (sp *StorePool) GetStores() map[roachpb.StoreID]roachpb.StoreDescriptor {
	sp.detailsMu.RLock()
	defer sp.detailsMu.RUnlock()
	stores := make(map[roachpb.StoreID]roachpb.StoreDescriptor, len(sp.detailsMu.storeDetails))
	for _, s := range sp.detailsMu.storeDetails {
		if s.desc != nil {
			stores[s.desc.StoreID] = *s.desc
		}
	}
	return stores
}

// getStoreDetailLocked returns the store detail for the given storeID.
// The lock must be held *in write mode* even though this looks like a
// read-only method.
func (sp *StorePool) getStoreDetailLocked(storeID roachpb.StoreID) *storeDetail {
	detail, ok := sp.detailsMu.storeDetails[storeID]
	if !ok {
		// We don't have this store yet (this is normal when we're
		// starting up and don't have full information from the gossip
		// network). The first time this occurs, presume the store is
		// alive, but start the clock so it will become dead if enough
		// time passes without updates from gossip.
		detail = newStoreDetail()
		detail.lastUpdatedTime = sp.startTime
		sp.detailsMu.storeDetails[storeID] = detail
	}
	return detail
}

// getStoreDescriptor returns the latest store descriptor for the given
// storeID.
func (sp *StorePool) getStoreDescriptor(storeID roachpb.StoreID) (roachpb.StoreDescriptor, bool) {
	sp.detailsMu.RLock()
	defer sp.detailsMu.RUnlock()

	if detail, ok := sp.detailsMu.storeDetails[storeID]; ok && detail.desc != nil {
		return *detail.desc, true
	}
	return roachpb.StoreDescriptor{}, false
}

// decommissioningReplicas filters out replicas on decommissioning node/store
// from the provided repls and returns them in a slice.
func (sp *StorePool) decommissioningReplicas(
	repls []roachpb.ReplicaDescriptor,
) (decommissioningReplicas []roachpb.ReplicaDescriptor) {
	sp.detailsMu.Lock()
	defer sp.detailsMu.Unlock()

	// NB: We use clock.Now().GoTime() instead of clock.PhysicalTime() in order to
	// take clock signals from remote nodes into consideration.
	now := sp.clock.Now().GoTime()
	timeUntilStoreDead := TimeUntilStoreDead.Get(&sp.st.SV)

	for _, repl := range repls {
		detail := sp.getStoreDetailLocked(repl.StoreID)
		switch detail.status(now, timeUntilStoreDead, sp.nodeLivenessFn) {
		case storeStatusDecommissioning:
			decommissioningReplicas = append(decommissioningReplicas, repl)
		}
	}
	return
}

// ClusterNodeCount returns the number of nodes that are possible allocation
// targets. This includes dead nodes, but not decommissioning or decommissioned
// nodes.
func (sp *StorePool) ClusterNodeCount() int {
	return sp.nodeCountFn()
}

// liveAndDeadReplicas divides the provided repls slice into two slices: the
// first for live replicas, and the second for dead replicas.
// Replicas for which liveness or deadness cannot be ascertained are excluded
// from the returned slices. Replicas on decommissioning node/store are
// considered live.
func (sp *StorePool) liveAndDeadReplicas(
	repls []roachpb.ReplicaDescriptor,
) (liveReplicas, deadReplicas []roachpb.ReplicaDescriptor) {
	sp.detailsMu.Lock()
	defer sp.detailsMu.Unlock()

	now := sp.clock.Now().GoTime()
	timeUntilStoreDead := TimeUntilStoreDead.Get(&sp.st.SV)

	for _, repl := range repls {
		detail := sp.getStoreDetailLocked(repl.StoreID)
		// Mark replica as dead if store is dead.
		status := detail.status(now, timeUntilStoreDead, sp.nodeLivenessFn)
		switch status {
		case storeStatusDead:
			deadReplicas = append(deadReplicas, repl)
		case storeStatusAvailable, storeStatusThrottled, storeStatusDecommissioning:
			// We count both available and throttled stores to be live for the
			// purpose of computing quorum.
			// We count decommissioning replicas to be alive because they are readable
			// and should be used for up-replication if necessary.
			liveReplicas = append(liveReplicas, repl)
		case storeStatusUnknown:
			// No-op.
		default:
			log.Fatalf(context.TODO(), "unknown store status %d", status)
		}
	}
	return
}
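// Illustrative usage (hypothetical caller): code that wants to know whether a
// range can still achieve quorum among its current replicas might do
//
//   live, dead := sp.liveAndDeadReplicas(replicas)
//   quorum := len(replicas)/2 + 1
//   canMakeProgress := len(live) >= quorum
//   _ = dead // e.g. candidates for replacement
//
// where replicas is the range's []roachpb.ReplicaDescriptor obtained elsewhere.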
// stat provides a running sample size and running stats.
type stat struct {
	n, mean float64
}

// update adds the specified value to the stat, augmenting the running stats.
func (s *stat) update(x float64) {
	s.n++
	s.mean += (x - s.mean) / s.n
}
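// Worked example: update maintains an incremental running mean, so no sum of
// the samples needs to be stored. For the inputs 10, 20, 30:
//
//   var s stat
//   s.update(10) // n=1, mean=10
//   s.update(20) // n=2, mean=15
//   s.update(30) // n=3, mean=20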
// StoreList holds a list of store descriptors and associated count and used
// stats for those stores.
type StoreList struct {
	stores []roachpb.StoreDescriptor

	// candidateRanges tracks range count stats for stores that are eligible to
	// be rebalance targets (their used capacity percentage must be lower than
	// maxFractionUsedThreshold).
	candidateRanges stat

	// candidateLeases tracks range lease stats for stores that are eligible to
	// be rebalance targets.
	candidateLeases stat

	// candidateLogicalBytes tracks disk usage stats for stores that are eligible
	// to be rebalance targets.
	candidateLogicalBytes stat

	// candidateQueriesPerSecond tracks queries-per-second stats for stores that
	// are eligible to be rebalance targets.
	candidateQueriesPerSecond stat

	// candidateWritesPerSecond tracks writes-per-second stats for stores that are
	// eligible to be rebalance targets.
	candidateWritesPerSecond stat
}

// makeStoreList generates a new store list based on the passed in descriptors.
// It will maintain the order of those descriptors.
func makeStoreList(descriptors []roachpb.StoreDescriptor) StoreList {
	sl := StoreList{stores: descriptors}
	for _, desc := range descriptors {
		if maxCapacityCheck(desc) {
			sl.candidateRanges.update(float64(desc.Capacity.RangeCount))
		}
		sl.candidateLeases.update(float64(desc.Capacity.LeaseCount))
		sl.candidateLogicalBytes.update(float64(desc.Capacity.LogicalBytes))
		sl.candidateQueriesPerSecond.update(desc.Capacity.QueriesPerSecond)
		sl.candidateWritesPerSecond.update(desc.Capacity.WritesPerSecond)
	}
	return sl
}

func (sl StoreList) String() string {
	var buf bytes.Buffer
	fmt.Fprintf(&buf,
		"  candidate: avg-ranges=%v avg-leases=%v avg-disk-usage=%v avg-queries-per-second=%v",
		sl.candidateRanges.mean,
		sl.candidateLeases.mean,
		humanizeutil.IBytes(int64(sl.candidateLogicalBytes.mean)),
		sl.candidateQueriesPerSecond.mean)
	if len(sl.stores) > 0 {
		fmt.Fprintf(&buf, "\n")
	} else {
		fmt.Fprintf(&buf, " <no candidates>")
	}
	for _, desc := range sl.stores {
		fmt.Fprintf(&buf, "  %d: ranges=%d leases=%d disk-usage=%s queries-per-second=%.2f\n",
			desc.StoreID, desc.Capacity.RangeCount,
			desc.Capacity.LeaseCount, humanizeutil.IBytes(desc.Capacity.LogicalBytes),
			desc.Capacity.QueriesPerSecond)
	}
	return buf.String()
}

// filter takes a store list and filters it using the passed in constraints. It
// maintains the original order of the passed in store list.
func (sl StoreList) filter(constraints []zonepb.ConstraintsConjunction) StoreList {
	if len(constraints) == 0 {
		return sl
	}
	var filteredDescs []roachpb.StoreDescriptor
	for _, store := range sl.stores {
		if ok := constraintsCheck(store, constraints); ok {
			filteredDescs = append(filteredDescs, store)
		}
	}
	return makeStoreList(filteredDescs)
}

type storeFilter int

const (
	_ storeFilter = iota
	// storeFilterNone requests that the storeList include all live stores. Dead,
	// unknown, and corrupted stores are always excluded from the storeList.
	storeFilterNone
	// storeFilterThrottled requests that the returned store list additionally
	// exclude stores that have been throttled for declining a snapshot. (See
	// storePool.throttle for details.) Throttled stores should not be considered
	// for replica rebalancing, for example, but can still be considered for lease
	// rebalancing.
	storeFilterThrottled
)

type throttledStoreReasons []string

// getStoreList returns a storeList of all active stores that have the required
// attributes, along with their associated stats. The storeList is filtered
// according to the provided storeFilter. It also returns the number of alive
// stores and the reasons for which stores were recently throttled.
func (sp *StorePool) getStoreList(filter storeFilter) (StoreList, int, throttledStoreReasons) {
	sp.detailsMu.RLock()
	defer sp.detailsMu.RUnlock()

	var storeIDs roachpb.StoreIDSlice
	for storeID := range sp.detailsMu.storeDetails {
		storeIDs = append(storeIDs, storeID)
	}
	return sp.getStoreListFromIDsRLocked(storeIDs, filter)
}
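// Illustrative usage (hypothetical caller): rebalancing code that must not pick
// recently throttled stores can request the filtered list, while code that only
// cares about liveness can pass storeFilterNone:
//
//   sl, aliveCount, throttledReasons := sp.getStoreList(storeFilterThrottled)
//   _ = sl               // candidate stores for new replicas
//   _ = aliveCount       // live stores, including throttled ones
//   _ = throttledReasons // most recent throttle reasons, e.g. for logging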
// getStoreListFromIDs is the same function as getStoreList but only returns stores
// from the subset of passed in store IDs.
func (sp *StorePool) getStoreListFromIDs(
	storeIDs roachpb.StoreIDSlice, filter storeFilter,
) (StoreList, int, throttledStoreReasons) {
	sp.detailsMu.RLock()
	defer sp.detailsMu.RUnlock()
	return sp.getStoreListFromIDsRLocked(storeIDs, filter)
}

// getStoreListFromIDsRLocked is the same function as getStoreList but requires
// that the detailsMu read lock is held.
func (sp *StorePool) getStoreListFromIDsRLocked(
	storeIDs roachpb.StoreIDSlice, filter storeFilter,
) (StoreList, int, throttledStoreReasons) {
	if sp.deterministic {
		sort.Sort(storeIDs)
	} else {
		shuffle.Shuffle(storeIDs)
	}

	var aliveStoreCount int
	var throttled throttledStoreReasons
	var storeDescriptors []roachpb.StoreDescriptor

	now := sp.clock.Now().GoTime()
	timeUntilStoreDead := TimeUntilStoreDead.Get(&sp.st.SV)

	for _, storeID := range storeIDs {
		detail, ok := sp.detailsMu.storeDetails[storeID]
		if !ok {
			// Do nothing; this store is not in the StorePool.
			continue
		}
		switch s := detail.status(now, timeUntilStoreDead, sp.nodeLivenessFn); s {
		case storeStatusThrottled:
			aliveStoreCount++
			throttled = append(throttled, detail.throttledBecause)
			if filter != storeFilterThrottled {
				storeDescriptors = append(storeDescriptors, *detail.desc)
			}
		case storeStatusAvailable:
			aliveStoreCount++
			storeDescriptors = append(storeDescriptors, *detail.desc)
		case storeStatusDead, storeStatusUnknown, storeStatusDecommissioning:
			// Do nothing; this store cannot be used.
		default:
			panic(fmt.Sprintf("unknown store status: %d", s))
		}
	}
	return makeStoreList(storeDescriptors), aliveStoreCount, throttled
}

type throttleReason int

const (
	_ throttleReason = iota
	throttleDeclined
	throttleFailed
)

// throttle informs the store pool that the given remote store declined a
// snapshot or failed to apply one, ensuring that it will not be considered
// for up-replication or rebalancing until after the configured timeout period
// has elapsed. A throttleDeclined reason indicates that the remote store
// explicitly declined a snapshot.
func (sp *StorePool) throttle(reason throttleReason, why string, storeID roachpb.StoreID) {
	sp.detailsMu.Lock()
	defer sp.detailsMu.Unlock()
	detail := sp.getStoreDetailLocked(storeID)
	detail.throttledBecause = why

	// If a snapshot is declined, be it due to an error or because it was
	// rejected, we mark the store detail as having been declined so it won't
	// be considered as a candidate for new replicas until after the configured
	// timeout period has passed.
	switch reason {
	case throttleDeclined:
		timeout := DeclinedReservationsTimeout.Get(&sp.st.SV)
		detail.throttledUntil = sp.clock.PhysicalTime().Add(timeout)
		if log.V(2) {
			ctx := sp.AnnotateCtx(context.TODO())
			log.Infof(ctx, "snapshot declined (%s), s%d will be throttled for %s until %s",
				why, storeID, timeout, detail.throttledUntil)
		}
	case throttleFailed:
		timeout := FailedReservationsTimeout.Get(&sp.st.SV)
		detail.throttledUntil = sp.clock.PhysicalTime().Add(timeout)
		if log.V(2) {
			ctx := sp.AnnotateCtx(context.TODO())
			log.Infof(ctx, "snapshot failed (%s), s%d will be throttled for %s until %s",
				why, storeID, timeout, detail.throttledUntil)
		}
	}
}
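// Illustrative usage (hypothetical caller): replica-change code that fails to
// send a snapshot to store s3 might report
//
//   sp.throttle(throttleFailed, err.Error(), roachpb.StoreID(3))
//
// so that s3 is skipped as an up-replication/rebalance target until
// server.failed_reservation_timeout has elapsed.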
// getLocalities returns the localities for the provided replicas.
// TODO(bram): consider storing a full list of all node to node diversity
// scores for faster lookups.
func (sp *StorePool) getLocalities(
	replicas []roachpb.ReplicaDescriptor,
) map[roachpb.NodeID]roachpb.Locality {
	sp.localitiesMu.RLock()
	defer sp.localitiesMu.RUnlock()
	localities := make(map[roachpb.NodeID]roachpb.Locality)
	for _, replica := range replicas {
		if locality, ok := sp.localitiesMu.nodeLocalities[replica.NodeID]; ok {
			localities[replica.NodeID] = locality.locality
		} else {
			localities[replica.NodeID] = roachpb.Locality{}
		}
	}
	return localities
}

// getNodeLocalityString returns the locality information for the given node
// in its string format.
func (sp *StorePool) getNodeLocalityString(nodeID roachpb.NodeID) string {
	sp.localitiesMu.RLock()
	defer sp.localitiesMu.RUnlock()
	locality, ok := sp.localitiesMu.nodeLocalities[nodeID]
	if !ok {
		return ""
	}
	return locality.str
}
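// Illustrative example (assumed values): roachpb.Locality renders its tiers as
// comma-separated key=value pairs, so a cached nodeLocalities entry might look
// like
//
//   localityWithString{
//       locality: roachpb.Locality{Tiers: []roachpb.Tier{
//           {Key: "region", Value: "us-east1"},
//           {Key: "zone", Value: "us-east1-b"},
//       }},
//       str: "region=us-east1,zone=us-east1-b",
//   }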