github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/store.go (about) 1 // Copyright 2014 The Cockroach Authors. 2 // 3 // Use of this software is governed by the Business Source License 4 // included in the file licenses/BSL.txt. 5 // 6 // As of the Change Date specified in that file, in accordance with 7 // the Business Source License, use of this software will be governed 8 // by the Apache License, Version 2.0, included in the file 9 // licenses/APL.txt. 10 11 package kvserver 12 13 import ( 14 "bytes" 15 "context" 16 "fmt" 17 "math" 18 "os" 19 "path/filepath" 20 "runtime" 21 "sort" 22 "strings" 23 "sync" 24 "sync/atomic" 25 "time" 26 "unsafe" 27 28 "github.com/cockroachdb/cockroach/pkg/base" 29 "github.com/cockroachdb/cockroach/pkg/clusterversion" 30 "github.com/cockroachdb/cockroach/pkg/config" 31 "github.com/cockroachdb/cockroach/pkg/config/zonepb" 32 "github.com/cockroachdb/cockroach/pkg/gossip" 33 "github.com/cockroachdb/cockroach/pkg/keys" 34 "github.com/cockroachdb/cockroach/pkg/kv" 35 "github.com/cockroachdb/cockroach/pkg/kv/kvbase" 36 "github.com/cockroachdb/cockroach/pkg/kv/kvserver/batcheval" 37 "github.com/cockroachdb/cockroach/pkg/kv/kvserver/closedts/container" 38 "github.com/cockroachdb/cockroach/pkg/kv/kvserver/closedts/ctpb" 39 "github.com/cockroachdb/cockroach/pkg/kv/kvserver/compactor" 40 "github.com/cockroachdb/cockroach/pkg/kv/kvserver/idalloc" 41 "github.com/cockroachdb/cockroach/pkg/kv/kvserver/intentresolver" 42 "github.com/cockroachdb/cockroach/pkg/kv/kvserver/protectedts" 43 "github.com/cockroachdb/cockroach/pkg/kv/kvserver/raftentry" 44 "github.com/cockroachdb/cockroach/pkg/kv/kvserver/tscache" 45 "github.com/cockroachdb/cockroach/pkg/kv/kvserver/txnrecovery" 46 "github.com/cockroachdb/cockroach/pkg/kv/kvserver/txnwait" 47 "github.com/cockroachdb/cockroach/pkg/roachpb" 48 "github.com/cockroachdb/cockroach/pkg/rpc" 49 "github.com/cockroachdb/cockroach/pkg/rpc/nodedialer" 50 "github.com/cockroachdb/cockroach/pkg/settings" 51 "github.com/cockroachdb/cockroach/pkg/settings/cluster" 52 "github.com/cockroachdb/cockroach/pkg/sql/sqlutil" 53 "github.com/cockroachdb/cockroach/pkg/storage" 54 "github.com/cockroachdb/cockroach/pkg/storage/cloud" 55 "github.com/cockroachdb/cockroach/pkg/storage/enginepb" 56 "github.com/cockroachdb/cockroach/pkg/util/contextutil" 57 "github.com/cockroachdb/cockroach/pkg/util/envutil" 58 "github.com/cockroachdb/cockroach/pkg/util/hlc" 59 "github.com/cockroachdb/cockroach/pkg/util/limit" 60 "github.com/cockroachdb/cockroach/pkg/util/log" 61 "github.com/cockroachdb/cockroach/pkg/util/metric" 62 "github.com/cockroachdb/cockroach/pkg/util/protoutil" 63 "github.com/cockroachdb/cockroach/pkg/util/quotapool" 64 "github.com/cockroachdb/cockroach/pkg/util/retry" 65 "github.com/cockroachdb/cockroach/pkg/util/shuffle" 66 "github.com/cockroachdb/cockroach/pkg/util/stop" 67 "github.com/cockroachdb/cockroach/pkg/util/syncutil" 68 "github.com/cockroachdb/cockroach/pkg/util/timeutil" 69 "github.com/cockroachdb/cockroach/pkg/util/tracing" 70 "github.com/cockroachdb/cockroach/pkg/util/uuid" 71 "github.com/cockroachdb/errors" 72 "github.com/cockroachdb/logtags" 73 "github.com/google/btree" 74 "go.etcd.io/etcd/raft" 75 "golang.org/x/time/rate" 76 ) 77 78 const ( 79 // rangeIDAllocCount is the number of Range IDs to allocate per allocation. 80 rangeIDAllocCount = 10 81 defaultRaftHeartbeatIntervalTicks = 5 82 83 // defaultRaftEntryCacheSize is the default size in bytes for a 84 // store's Raft log entry cache. 
85 defaultRaftEntryCacheSize = 1 << 24 // 16M 86 87 // replicaRequestQueueSize specifies the maximum number of requests to queue 88 // for a replica. 89 replicaRequestQueueSize = 100 90 91 defaultGossipWhenCapacityDeltaExceedsFraction = 0.01 92 93 // systemDataGossipInterval is the interval at which range lease 94 // holders verify that the most recent system data is gossiped. 95 // This ensures that system data is always eventually gossiped, even 96 // if a range lease holder experiences a failure causing a missed 97 // gossip update. 98 systemDataGossipInterval = 1 * time.Minute 99 ) 100 101 var storeSchedulerConcurrency = envutil.EnvOrDefaultInt( 102 "COCKROACH_SCHEDULER_CONCURRENCY", 8*runtime.NumCPU()) 103 104 var logSSTInfoTicks = envutil.EnvOrDefaultInt( 105 "COCKROACH_LOG_SST_INFO_TICKS_INTERVAL", 60, 106 ) 107 108 // bulkIOWriteLimit is defined here because it is used by BulkIOWriteLimiter. 109 var bulkIOWriteLimit = settings.RegisterPublicByteSizeSetting( 110 "kv.bulk_io_write.max_rate", 111 "the rate limit (bytes/sec) to use for writes to disk on behalf of bulk io ops", 112 1<<40, 113 ) 114 115 // importRequestsLimit limits concurrent import requests. 116 var importRequestsLimit = settings.RegisterPositiveIntSetting( 117 "kv.bulk_io_write.concurrent_import_requests", 118 "number of import requests a store will handle concurrently before queuing", 119 1, 120 ) 121 122 // addSSTableRequestLimit limits concurrent AddSSTable requests. 123 var addSSTableRequestLimit = settings.RegisterPositiveIntSetting( 124 "kv.bulk_io_write.concurrent_addsstable_requests", 125 "number of AddSSTable requests a store will handle concurrently before queuing", 126 1, 127 ) 128 129 // concurrentRangefeedItersLimit limits concurrent rangefeed catchup iterators. 130 var concurrentRangefeedItersLimit = settings.RegisterPositiveIntSetting( 131 "kv.rangefeed.concurrent_catchup_iterators", 132 "number of rangefeed catchup iterators a store will allow concurrently before queueing", 133 64, 134 ) 135 136 // raftLeadershipTransferWait limits the amount of time a drain command 137 // waits for lease transfers. 138 var raftLeadershipTransferWait = func() *settings.DurationSetting { 139 s := settings.RegisterValidatedDurationSetting( 140 raftLeadershipTransferWaitKey, 141 "the amount of time a server waits to transfer range leases before proceeding with the rest of the shutdown process", 142 5*time.Second, 143 func(v time.Duration) error { 144 if v < 0 { 145 return errors.Errorf("cannot set %s to a negative duration: %s", 146 raftLeadershipTransferWaitKey, v) 147 } 148 return nil 149 }, 150 ) 151 s.SetVisibility(settings.Public) 152 return s 153 }() 154 155 const raftLeadershipTransferWaitKey = "server.shutdown.lease_transfer_wait" 156 157 // ExportRequestsLimit is the number of Export requests that can run at once. 158 // Each extracts data from RocksDB to a temp file and then uploads it to cloud 159 // storage. In order to not exhaust the disk or memory, or saturate the network, 160 // limit the number of these that can be run in parallel. This number was chosen 161 // by guessing - it could be improved by more measured heuristics. Exported 162 // here since we check it in the caller to limit generated requests as well 163 // to prevent excessive queuing.
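// For example, a caller that fans out export work can consult the same
// setting before generating more requests (editorial sketch; st, outstanding
// and the surrounding control flow are hypothetical, not part of this file):
//
//     max := int(ExportRequestsLimit.Get(&st.SV)) // st is a *cluster.Settings
//     if outstanding >= max {
//         // Hold off on issuing another ExportRequest until one completes.
//     }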
164 var ExportRequestsLimit = settings.RegisterPositiveIntSetting( 165 "kv.bulk_io_write.concurrent_export_requests", 166 "number of export requests a store will handle concurrently before queuing", 167 3, 168 ) 169 170 // TestStoreConfig has some fields initialized with values relevant in tests. 171 func TestStoreConfig(clock *hlc.Clock) StoreConfig { 172 if clock == nil { 173 clock = hlc.NewClock(hlc.UnixNano, time.Nanosecond) 174 } 175 st := cluster.MakeTestingClusterSettings() 176 sc := StoreConfig{ 177 DefaultZoneConfig: zonepb.DefaultZoneConfigRef(), 178 DefaultSystemZoneConfig: zonepb.DefaultSystemZoneConfigRef(), 179 Settings: st, 180 AmbientCtx: log.AmbientContext{Tracer: st.Tracer}, 181 Clock: clock, 182 CoalescedHeartbeatsInterval: 50 * time.Millisecond, 183 RaftHeartbeatIntervalTicks: 1, 184 ScanInterval: 10 * time.Minute, 185 HistogramWindowInterval: metric.TestSampleInterval, 186 EnableEpochRangeLeases: true, 187 ClosedTimestamp: container.NoopContainer(), 188 ProtectedTimestampCache: protectedts.EmptyCache(clock), 189 } 190 191 // Use shorter Raft tick settings in order to minimize start up and failover 192 // time in tests. 193 sc.RaftElectionTimeoutTicks = 3 194 sc.RaftTickInterval = 100 * time.Millisecond 195 sc.SetDefaults() 196 return sc 197 } 198 199 func newRaftConfig( 200 strg raft.Storage, id uint64, appliedIndex uint64, storeCfg StoreConfig, logger raft.Logger, 201 ) *raft.Config { 202 return &raft.Config{ 203 ID: id, 204 Applied: appliedIndex, 205 ElectionTick: storeCfg.RaftElectionTimeoutTicks, 206 HeartbeatTick: storeCfg.RaftHeartbeatIntervalTicks, 207 MaxUncommittedEntriesSize: storeCfg.RaftMaxUncommittedEntriesSize, 208 MaxCommittedSizePerReady: storeCfg.RaftMaxCommittedSizePerReady, 209 MaxSizePerMsg: storeCfg.RaftMaxSizePerMsg, 210 MaxInflightMsgs: storeCfg.RaftMaxInflightMsgs, 211 Storage: strg, 212 Logger: logger, 213 214 PreVote: true, 215 } 216 } 217 218 // verifyKeys verifies keys. If checkEndKey is true, then the end key 219 // is verified to be non-nil and greater than start key. If 220 // checkEndKey is false, end key is verified to be nil. Additionally, 221 // verifies that start key is less than KeyMax and end key is less 222 // than or equal to KeyMax. It also verifies that a key range that 223 // contains range-local keys is completely range-local. 224 func verifyKeys(start, end roachpb.Key, checkEndKey bool) error { 225 if bytes.Compare(start, roachpb.KeyMax) >= 0 { 226 return errors.Errorf("start key %q must be less than KeyMax", start) 227 } 228 if !checkEndKey { 229 if len(end) != 0 { 230 return errors.Errorf("end key %q should not be specified for this operation", end) 231 } 232 return nil 233 } 234 if end == nil { 235 return errors.Errorf("end key must be specified") 236 } 237 if bytes.Compare(roachpb.KeyMax, end) < 0 { 238 return errors.Errorf("end key %q must be less than or equal to KeyMax", end) 239 } 240 { 241 sAddr, err := keys.Addr(start) 242 if err != nil { 243 return err 244 } 245 eAddr, err := keys.Addr(end) 246 if err != nil { 247 return err 248 } 249 if !sAddr.Less(eAddr) { 250 return errors.Errorf("end key %q must be greater than start %q", end, start) 251 } 252 if !bytes.Equal(sAddr, start) { 253 if bytes.Equal(eAddr, end) { 254 return errors.Errorf("start key is range-local, but end key is not") 255 } 256 } else if bytes.Compare(start, keys.LocalMax) < 0 { 257 // It's a range op, not local but somehow plows through local data - 258 // not cool. 
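// To make the checks in this function concrete (editorial sketch, not part of
// the original file; the keys are arbitrary examples):
//
//     _ = verifyKeys(keys.LocalMax, roachpb.Key("z"), true)   // ok: global range starting at LocalMax
//     _ = verifyKeys(roachpb.Key(""), roachpb.Key("z"), true) // error: global start key below LocalMax
//     _ = verifyKeys(roachpb.Key("a"), nil, false)            // ok: point operation, no end key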
259 return errors.Errorf("start key in [%q,%q) must be greater than LocalMax", start, end) 260 } 261 } 262 263 return nil 264 } 265 266 // rangeKeyItem is a common interface for roachpb.Key and Range. 267 type rangeKeyItem interface { 268 startKey() roachpb.RKey 269 } 270 271 // rangeBTreeKey is a type alias of roachpb.RKey that implements the 272 // rangeKeyItem interface and the btree.Item interface. 273 type rangeBTreeKey roachpb.RKey 274 275 var _ rangeKeyItem = rangeBTreeKey{} 276 277 func (k rangeBTreeKey) startKey() roachpb.RKey { 278 return (roachpb.RKey)(k) 279 } 280 281 var _ btree.Item = rangeBTreeKey{} 282 283 func (k rangeBTreeKey) Less(i btree.Item) bool { 284 return k.startKey().Less(i.(rangeKeyItem).startKey()) 285 } 286 287 // A NotBootstrappedError indicates that an engine has not yet been 288 // bootstrapped due to a store identifier not being present. 289 type NotBootstrappedError struct{} 290 291 // Error formats error. 292 func (e *NotBootstrappedError) Error() string { 293 return "store has not been bootstrapped" 294 } 295 296 // A storeReplicaVisitor calls a visitor function for each of a store's 297 // initialized Replicas (in unspecified order). It provides an option 298 // to visit replicas in increasing RangeID order. 299 type storeReplicaVisitor struct { 300 store *Store 301 repls []*Replica // Replicas to be visited 302 ordered bool // Option to visit replicas in sorted order 303 visited int // Number of visited ranges, -1 before first call to Visit() 304 } 305 306 // Len implements sort.Interface. 307 func (rs storeReplicaVisitor) Len() int { return len(rs.repls) } 308 309 // Less implements sort.Interface. 310 func (rs storeReplicaVisitor) Less(i, j int) bool { return rs.repls[i].RangeID < rs.repls[j].RangeID } 311 312 // Swap implements sort.Interface. 313 func (rs storeReplicaVisitor) Swap(i, j int) { rs.repls[i], rs.repls[j] = rs.repls[j], rs.repls[i] } 314 315 // newStoreReplicaVisitor constructs a storeReplicaVisitor. 316 func newStoreReplicaVisitor(store *Store) *storeReplicaVisitor { 317 return &storeReplicaVisitor{ 318 store: store, 319 visited: -1, 320 } 321 } 322 323 // InOrder tells the visitor to visit replicas in increasing RangeID order. 324 func (rs *storeReplicaVisitor) InOrder() *storeReplicaVisitor { 325 rs.ordered = true 326 return rs 327 } 328 329 // Visit calls the visitor with each Replica until false is returned. 330 func (rs *storeReplicaVisitor) Visit(visitor func(*Replica) bool) { 331 // Copy the range IDs to a slice so that we iterate over some (possibly 332 // stale) view of all Replicas without holding the Store lock. In particular, 333 // no locks are acquired during the copy process. 334 rs.repls = nil 335 rs.store.mu.replicas.Range(func(k int64, v unsafe.Pointer) bool { 336 rs.repls = append(rs.repls, (*Replica)(v)) 337 return true 338 }) 339 340 if rs.ordered { 341 // If the replicas were requested in sorted order, perform the sort. 342 sort.Sort(rs) 343 } else { 344 // The Replicas are already in "unspecified order" due to map iteration, 345 // but we want to make sure it's completely random to prevent issues in 346 // tests where stores are scanning replicas in lock-step and one store is 347 // winning the race and getting a first crack at processing the replicas on 348 // its queues. 349 // 350 // TODO(peter): Re-evaluate whether this is necessary after we allow 351 // rebalancing away from the leaseholder. See TestRebalance_3To5Small. 
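// (Editorial aside: callers consume this visitor with a callback that returns
// false to stop early, mirroring the usage in SetDraining further below; the
// callback body here is hypothetical.)
//
//     newStoreReplicaVisitor(s).InOrder().Visit(func(r *Replica) bool {
//         // ... inspect or enqueue r ...
//         return true // returning false stops the iteration
//     })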
352 shuffle.Shuffle(rs) 353 } 354 355 rs.visited = 0 356 for _, repl := range rs.repls { 357 // TODO(tschottdorf): let the visitor figure out if something's been 358 // destroyed once we return errors from mutexes (#9190). After all, it 359 // can still happen with this code. 360 rs.visited++ 361 repl.mu.RLock() 362 destroyed := repl.mu.destroyStatus 363 initialized := repl.isInitializedRLocked() 364 repl.mu.RUnlock() 365 if initialized && destroyed.IsAlive() && !visitor(repl) { 366 break 367 } 368 } 369 rs.visited = 0 370 } 371 372 // EstimatedCount returns an estimated count of the underlying store's 373 // replicas. 374 // 375 // TODO(tschottdorf): this method has highly doubtful semantics. 376 func (rs *storeReplicaVisitor) EstimatedCount() int { 377 if rs.visited <= 0 { 378 return rs.store.ReplicaCount() 379 } 380 return len(rs.repls) - rs.visited 381 } 382 383 // A Store maintains a map of ranges by start key. A Store corresponds 384 // to one physical device. 385 type Store struct { 386 Ident *roachpb.StoreIdent // pointer to catch access before Start() is called 387 cfg StoreConfig 388 db *kv.DB 389 engine storage.Engine // The underlying key-value store 390 compactor *compactor.Compactor // Schedules compaction of the engine 391 tsCache tscache.Cache // Most recent timestamps for keys / key ranges 392 allocator Allocator // Makes allocation decisions 393 replRankings *replicaRankings 394 storeRebalancer *StoreRebalancer 395 rangeIDAlloc *idalloc.Allocator // Range ID allocator 396 gcQueue *gcQueue // Garbage collection queue 397 mergeQueue *mergeQueue // Range merging queue 398 splitQueue *splitQueue // Range splitting queue 399 replicateQueue *replicateQueue // Replication queue 400 replicaGCQueue *replicaGCQueue // Replica GC queue 401 raftLogQueue *raftLogQueue // Raft log truncation queue 402 raftSnapshotQueue *raftSnapshotQueue // Raft repair queue 403 tsMaintenanceQueue *timeSeriesMaintenanceQueue // Time series maintenance queue 404 scanner *replicaScanner // Replica scanner 405 consistencyQueue *consistencyQueue // Replica consistency check queue 406 metrics *StoreMetrics 407 intentResolver *intentresolver.IntentResolver 408 recoveryMgr txnrecovery.Manager 409 raftEntryCache *raftentry.Cache 410 limiters batcheval.Limiters 411 txnWaitMetrics *txnwait.Metrics 412 sstSnapshotStorage SSTSnapshotStorage 413 protectedtsCache protectedts.Cache 414 415 // gossipRangeCountdown and gossipLeaseCountdown are countdowns of 416 // changes to range and leaseholder counts, after which the store 417 // descriptor will be re-gossiped earlier than the normal periodic 418 // gossip interval. Updated atomically. 419 gossipRangeCountdown int32 420 gossipLeaseCountdown int32 421 // gossipQueriesPerSecondVal and gossipWritesPerSecondVal serve similar 422 // purposes, but simply record the most recently gossiped value so that we 423 // can tell if a newly measured value differs by enough to justify 424 // re-gossiping the store. 425 gossipQueriesPerSecondVal syncutil.AtomicFloat64 426 gossipWritesPerSecondVal syncutil.AtomicFloat64 427 428 coalescedMu struct { 429 syncutil.Mutex 430 heartbeats map[roachpb.StoreIdent][]RaftHeartbeat 431 heartbeatResponses map[roachpb.StoreIdent][]RaftHeartbeat 432 } 433 // 1 if the store was started, 0 if it wasn't. To be accessed using atomic 434 // ops. 435 started int32 436 stopper *stop.Stopper 437 // The time when the store was Start()ed, in nanos.
438 startedAt int64 439 nodeDesc *roachpb.NodeDescriptor 440 initComplete sync.WaitGroup // Signaled by async init tasks 441 442 // Semaphore to limit concurrent non-empty snapshot application. 443 snapshotApplySem chan struct{} 444 445 // Track newly-acquired expiration-based leases that we want to proactively 446 // renew. An object is sent on the signal whenever a new entry is added to 447 // the map. 448 renewableLeases syncutil.IntMap // map[roachpb.RangeID]*Replica 449 renewableLeasesSignal chan struct{} 450 451 // draining holds a bool which indicates whether this store is draining. See 452 // SetDraining() for a more detailed explanation of behavior changes. 453 // 454 // TODO(bdarnell,tschottdorf): Would look better inside of `mu`, which at 455 // the time of its creation was riddled with deadlock (but that situation 456 // has likely improved). 457 draining atomic.Value 458 459 // Locking notes: To avoid deadlocks, the following lock order must be 460 // obeyed: baseQueue.mu < Replica.raftMu < Replica.readOnlyCmdMu < Store.mu 461 // < Replica.mu < Replica.unreachablesMu < Store.coalescedMu < Store.scheduler.mu. 462 // (It is not required to acquire every lock in sequence, but when multiple 463 // locks are held at the same time, it is incorrect to acquire a lock with 464 // "lesser" value in this sequence after one with "greater" value). 465 // 466 // Methods of Store with a "Locked" suffix require that 467 // Store.mu.Mutex be held. Other locking requirements are indicated 468 // in comments. 469 // 470 // The locking structure here is complex because A) Store is a 471 // container of Replicas, so it must generally be consulted before 472 // doing anything with any Replica, B) some Replica operations 473 // (including splits) modify the Store. Therefore we generally lock 474 // Store.mu to find a Replica, release it, then call a method on the 475 // Replica. These short-lived locks of Store.mu and Replica.mu are 476 // often surrounded by a long-lived lock of Replica.raftMu as 477 // described below. 478 // 479 // There are two major entry points to this stack of locks: 480 // Store.Send (which handles incoming RPCs) and raft-related message 481 // processing (including handleRaftReady on the processRaft 482 // goroutine and HandleRaftRequest on GRPC goroutines). Reads are 483 // processed solely through Store.Send; writes start out on 484 // Store.Send until they propose their raft command and then they 485 // finish on the raft goroutines. 486 // 487 // TODO(bdarnell): a Replica could be destroyed immediately after 488 // Store.Send finds the Replica and releases the lock. We need 489 // another RWMutex to be held by anything using a Replica to ensure 490 // that everything is finished before releasing it. #7169 491 // 492 // Detailed description of the locks: 493 // 494 // * Replica.raftMu: Held while any raft messages are being processed 495 // (including handleRaftReady and HandleRaftRequest) or while the set of 496 // Replicas in the Store is being changed (which may happen outside of raft 497 // via the replica GC queue). 498 // 499 // If holding raftMus for multiple different replicas simultaneously, 500 // acquire the locks in the order that the replicas appear in replicasByKey. 501 // 502 // * Replica.readOnlyCmdMu (RWMutex): Held in read mode while any 503 // read-only command is in progress on the replica; held in write 504 // mode while executing a commit trigger. 
This is necessary 505 // because read-only commands mutate the Replica's timestamp cache 506 // (while holding Replica.mu in addition to readOnlyCmdMu). The 507 // RWMutex ensures that no reads are being executed during a split 508 // (which copies the timestamp cache) while still allowing 509 // multiple reads in parallel (#3148). TODO(bdarnell): this lock 510 // only needs to be held during splitTrigger, not all triggers. 511 // 512 // * baseQueue.mu: The mutex contained in each of the store's queues (such 513 // as the replicate queue, replica GC queue, GC queue, ...). The mutex is 514 // typically acquired when deciding whether to add a replica to the respective 515 // queue. 516 // 517 // * Store.mu: Protects the Store's map of its Replicas. Acquired and 518 // released briefly at the start of each request; metadata operations like 519 // splits acquire it again to update the map. Even though these lock 520 // acquisitions do not make up a single critical section, it is safe thanks 521 // to Replica.raftMu which prevents any concurrent modifications. 522 // 523 // * Replica.mu: Protects the Replica's in-memory state. Acquired 524 // and released briefly as needed (note that while the lock is 525 // held "briefly" in that it is not held for an entire request, we 526 // do sometimes do I/O while holding the lock, as in 527 // Replica.Entries). This lock should be held when calling any 528 // methods on the raft group. Raft may call back into the Replica 529 // via the methods of the raft.Storage interface, which assume the 530 // lock is held even though they do not follow our convention of 531 // the "Locked" suffix. 532 // 533 // * Store.scheduler.mu: Protects the Raft scheduler internal 534 // state. Callbacks from the scheduler are performed while not holding this 535 // mutex in order to observe the above ordering constraints. 536 // 537 // Splits and merges deserve special consideration: they operate on two 538 // ranges. For splits, this might seem fine because the right-hand range is 539 // brand new, but an uninitialized version may have been created by a raft 540 // message before we process the split (see commentary on 541 // Replica.splitTrigger). We make this safe, for both splits and merges, by 542 // locking the right-hand range for the duration of the Raft command 543 // containing the split/merge trigger. 544 // 545 // Note that because we acquire and release Store.mu and Replica.mu 546 // repeatedly rather than holding a lock for an entire request, we are 547 // actually relying on higher-level locks to ensure that things don't change 548 // out from under us. In particular, handleRaftReady accesses the replicaID 549 // more than once, and we rely on Replica.raftMu to ensure that this is not 550 // modified by a concurrent HandleRaftRequest. (#4476) 551 552 mu struct { 553 syncutil.RWMutex 554 // Map of replicas by Range ID (map[roachpb.RangeID]*Replica). This 555 // includes `uninitReplicas`. May be read without holding Store.mu. 556 replicas syncutil.IntMap 557 // A btree key containing objects of type *Replica or *ReplicaPlaceholder. 558 // Both types have an associated key range; the btree is keyed on their 559 // start keys. 560 replicasByKey *btree.BTree 561 uninitReplicas map[roachpb.RangeID]*Replica // Map of uninitialized replicas by Range ID 562 // replicaPlaceholders is a map to access all placeholders, so they can 563 // be directly accessed and cleared after stepping all raft groups. This 564 // is always in sync with the placeholders in replicasByKey. 
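// (Editorial aside on the locking notes above: when these locks are held at
// the same time, the documented order means a typical raft-processing path
// looks roughly like the following sketch, with Store.mu and Replica.mu held
// only briefly inside the long-lived raftMu critical section.)
//
//     r.raftMu.Lock()   // long-lived; serializes raft processing and replica removal
//     s.mu.RLock()      // brief: consult the Store's replica maps
//     s.mu.RUnlock()
//     r.mu.Lock()       // brief: read or update the Replica's in-memory state
//     r.mu.Unlock()
//     r.raftMu.Unlock()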
565 replicaPlaceholders map[roachpb.RangeID]*ReplicaPlaceholder 566 } 567 568 // The unquiesced subset of replicas. 569 unquiescedReplicas struct { 570 syncutil.Mutex 571 m map[roachpb.RangeID]struct{} 572 } 573 574 // The subset of replicas with active rangefeeds. 575 rangefeedReplicas struct { 576 syncutil.Mutex 577 m map[roachpb.RangeID]struct{} 578 } 579 580 // replicaQueues is a map of per-Replica incoming request queues. These 581 // queues might more naturally belong in Replica, but are kept separate to 582 // avoid reworking the locking in getOrCreateReplica which requires 583 // Replica.raftMu to be held while a replica is being inserted into 584 // Store.mu.replicas. 585 replicaQueues syncutil.IntMap // map[roachpb.RangeID]*raftRequestQueue 586 587 scheduler *raftScheduler 588 589 // livenessMap is a map from nodeID to a bool indicating 590 // liveness. It is updated periodically in raftTickLoop(). 591 livenessMap atomic.Value 592 593 // cachedCapacity caches information on store capacity to prevent 594 // expensive recomputations in case leases or replicas are rapidly 595 // rebalancing. 596 cachedCapacity struct { 597 syncutil.Mutex 598 roachpb.StoreCapacity 599 } 600 601 counts struct { 602 // Number of placeholders removed due to error. 603 removedPlaceholders int32 604 // Number of placeholders successfully filled by a snapshot. 605 filledPlaceholders int32 606 // Number of placeholders removed due to a snapshot that was dropped by 607 // raft. 608 droppedPlaceholders int32 609 } 610 611 computeInitialMetrics sync.Once 612 } 613 614 var _ kv.Sender = &Store{} 615 616 // A StoreConfig encompasses the auxiliary objects and configuration 617 // required to create a store. 618 // All fields holding a pointer or an interface are required to create 619 // a store; the rest will have sane defaults set if omitted. 620 type StoreConfig struct { 621 AmbientCtx log.AmbientContext 622 base.RaftConfig 623 624 DefaultZoneConfig *zonepb.ZoneConfig 625 DefaultSystemZoneConfig *zonepb.ZoneConfig 626 Settings *cluster.Settings 627 Clock *hlc.Clock 628 DB *kv.DB 629 Gossip *gossip.Gossip 630 NodeLiveness *NodeLiveness 631 StorePool *StorePool 632 Transport *RaftTransport 633 NodeDialer *nodedialer.Dialer 634 RPCContext *rpc.Context 635 RangeDescriptorCache kvbase.RangeDescriptorCache 636 637 ClosedTimestamp *container.Container 638 639 // SQLExecutor is used by the store to execute SQL statements. 640 SQLExecutor sqlutil.InternalExecutor 641 642 // TimeSeriesDataStore is an interface used by the store's time series 643 // maintenance queue to dispatch individual maintenance tasks. 644 TimeSeriesDataStore TimeSeriesDataStore 645 646 // CoalescedHeartbeatsInterval is the interval for which heartbeat messages 647 // are queued and then sent as a single coalesced heartbeat; it is a 648 // fraction of the RaftTickInterval so that heartbeats don't get delayed by 649 // an entire tick. Delaying coalescing heartbeat responses has a bad 650 // interaction with quiescence because the coalesced (delayed) heartbeat 651 // response can unquiesce the leader. Consider: 652 // 653 // T+0: leader queues MsgHeartbeat 654 // T+1: leader sends MsgHeartbeat 655 // follower receives MsgHeartbeat 656 // follower queues MsgHeartbeatResp 657 // T+2: leader queues quiesce message 658 // follower sends MsgHeartbeatResp 659 // leader receives MsgHeartbeatResp 660 // T+3: leader sends quiesce message 661 // 662 // Thus we want to make sure that heartbeats are responded to faster than 663 // the quiesce cadence. 
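// As a worked example of the timing above (editorial; the tick value is only
// an illustration): with a RaftTickInterval of, say, 200ms, SetDefaults below
// yields CoalescedHeartbeatsInterval = 200ms/2 = 100ms, while heartbeats are
// only generated every RaftHeartbeatIntervalTicks * RaftTickInterval =
// 5 * 200ms = 1s, so coalesced responses are flushed well ahead of the
// tick-driven quiescence decisions.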
664 CoalescedHeartbeatsInterval time.Duration 665 666 // RaftHeartbeatIntervalTicks is the number of ticks that pass between heartbeats. 667 RaftHeartbeatIntervalTicks int 668 669 // ScanInterval is the default value for the scan interval 670 ScanInterval time.Duration 671 672 // ScanMinIdleTime is the minimum time the scanner will be idle between ranges. 673 // If enabled (> 0), the scanner may complete in more than ScanInterval for 674 // stores with many ranges. 675 ScanMinIdleTime time.Duration 676 677 // ScanMaxIdleTime is the maximum time the scanner will be idle between ranges. 678 // If enabled (> 0), the scanner may complete in less than ScanInterval for small 679 // stores. 680 ScanMaxIdleTime time.Duration 681 682 // If LogRangeEvents is true, major changes to ranges will be logged into 683 // the range event log. 684 LogRangeEvents bool 685 686 // RaftEntryCacheSize is the size in bytes of the Raft log entry cache 687 // shared by all Raft groups managed by the store. 688 RaftEntryCacheSize uint64 689 690 // IntentResolverTaskLimit is the maximum number of asynchronous tasks that 691 // may be started by the intent resolver. -1 indicates no asynchronous tasks 692 // are allowed. 0 uses the default value (defaultIntentResolverTaskLimit) 693 // which is non-zero. 694 IntentResolverTaskLimit int 695 696 TestingKnobs StoreTestingKnobs 697 698 // concurrentSnapshotApplyLimit specifies the maximum number of empty 699 // snapshots and the maximum number of non-empty snapshots that are permitted 700 // to be applied concurrently. 701 concurrentSnapshotApplyLimit int 702 703 // HistogramWindowInterval is (server.Config).HistogramWindowInterval 704 HistogramWindowInterval time.Duration 705 706 // EnableEpochRangeLeases controls whether epoch-based range leases are used. 707 EnableEpochRangeLeases bool 708 709 // GossipWhenCapacityDeltaExceedsFraction specifies the fraction from the last 710 // gossiped store capacity values which need be exceeded before the store will 711 // gossip immediately without waiting for the periodic gossip interval. 712 GossipWhenCapacityDeltaExceedsFraction float64 713 714 // ExternalStorage creates ExternalStorage objects which allows access to external files 715 ExternalStorage cloud.ExternalStorageFactory 716 ExternalStorageFromURI cloud.ExternalStorageFromURIFactory 717 718 // ProtectedTimestampCache maintains the state of the protected timestamp 719 // subsystem. It is queried during the GC process and in the handling of 720 // AdminVerifyProtectedTimestampRequest. 721 ProtectedTimestampCache protectedts.Cache 722 } 723 724 // ConsistencyTestingKnobs is a BatchEvalTestingKnobs struct used to control the 725 // behavior of the consistency checker for tests. 726 type ConsistencyTestingKnobs struct { 727 // If non-nil, OnBadChecksumFatal is called by CheckConsistency() (instead of 728 // calling log.Fatal) on a checksum mismatch. 729 OnBadChecksumFatal func(roachpb.StoreIdent) 730 // If non-nil, BadChecksumReportDiff is called by CheckConsistency() on a 731 // checksum mismatch to report the diff between snapshots. 732 BadChecksumReportDiff func(roachpb.StoreIdent, ReplicaSnapshotDiffSlice) 733 ConsistencyQueueResultHook func(response roachpb.CheckConsistencyResponse) 734 } 735 736 // Valid returns true if the StoreConfig is populated correctly. 737 // We don't check for Gossip and DB since some of our tests pass 738 // that as nil. 
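// For example (editorial sketch; transport and ctx are assumed to exist and
// are not defined in this file), a test can start from TestStoreConfig and
// then supply the pieces Valid() insists on:
//
//     cfg := TestStoreConfig(nil /* clock */)
//     cfg.Transport = transport // TestStoreConfig does not populate the transport
//     if !cfg.Valid() {
//         log.Fatalf(ctx, "invalid store configuration: %+v", &cfg)
//     }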
739 func (sc *StoreConfig) Valid() bool { 740 return sc.Clock != nil && sc.Transport != nil && 741 sc.RaftTickInterval != 0 && sc.RaftHeartbeatIntervalTicks > 0 && 742 sc.RaftElectionTimeoutTicks > 0 && sc.ScanInterval >= 0 && 743 sc.AmbientCtx.Tracer != nil 744 } 745 746 // SetDefaults initializes unset fields in StoreConfig to values 747 // suitable for use on a local network. 748 // TODO(tschottdorf): see if this ought to be configurable via flags. 749 func (sc *StoreConfig) SetDefaults() { 750 sc.RaftConfig.SetDefaults() 751 752 if sc.CoalescedHeartbeatsInterval == 0 { 753 sc.CoalescedHeartbeatsInterval = sc.RaftTickInterval / 2 754 } 755 if sc.RaftHeartbeatIntervalTicks == 0 { 756 sc.RaftHeartbeatIntervalTicks = defaultRaftHeartbeatIntervalTicks 757 } 758 if sc.RaftEntryCacheSize == 0 { 759 sc.RaftEntryCacheSize = defaultRaftEntryCacheSize 760 } 761 if sc.concurrentSnapshotApplyLimit == 0 { 762 // NB: setting this value higher than 1 is likely to degrade client 763 // throughput. 764 sc.concurrentSnapshotApplyLimit = 765 envutil.EnvOrDefaultInt("COCKROACH_CONCURRENT_SNAPSHOT_APPLY_LIMIT", 1) 766 } 767 768 if sc.GossipWhenCapacityDeltaExceedsFraction == 0 { 769 sc.GossipWhenCapacityDeltaExceedsFraction = defaultGossipWhenCapacityDeltaExceedsFraction 770 } 771 } 772 773 // LeaseExpiration returns an int64 to increment a manual clock with to 774 // make sure that all active range leases expire. 775 func (sc *StoreConfig) LeaseExpiration() int64 { 776 // Due to lease extensions, the remaining interval can be longer than just 777 // the sum of the offset (=length of stasis period) and the active 778 // duration, but definitely not by 2x. 779 maxOffset := sc.Clock.MaxOffset() 780 return 2 * (sc.RangeLeaseActiveDuration() + maxOffset).Nanoseconds() 781 } 782 783 // NewStore returns a new instance of a store. 784 func NewStore( 785 ctx context.Context, cfg StoreConfig, eng storage.Engine, nodeDesc *roachpb.NodeDescriptor, 786 ) *Store { 787 // TODO(tschottdorf): find better place to set these defaults. 788 cfg.SetDefaults() 789 790 if !cfg.Valid() { 791 log.Fatalf(ctx, "invalid store configuration: %+v", &cfg) 792 } 793 s := &Store{ 794 cfg: cfg, 795 db: cfg.DB, // TODO(tschottdorf): remove redundancy. 
796 engine: eng, 797 nodeDesc: nodeDesc, 798 metrics: newStoreMetrics(cfg.HistogramWindowInterval), 799 } 800 if cfg.RPCContext != nil { 801 s.allocator = MakeAllocator(cfg.StorePool, cfg.RPCContext.RemoteClocks.Latency) 802 } else { 803 s.allocator = MakeAllocator(cfg.StorePool, func(string) (time.Duration, bool) { 804 return 0, false 805 }) 806 } 807 s.replRankings = newReplicaRankings() 808 809 s.draining.Store(false) 810 s.scheduler = newRaftScheduler(s.metrics, s, storeSchedulerConcurrency) 811 812 s.raftEntryCache = raftentry.NewCache(cfg.RaftEntryCacheSize) 813 s.metrics.registry.AddMetricStruct(s.raftEntryCache.Metrics()) 814 815 s.coalescedMu.Lock() 816 s.coalescedMu.heartbeats = map[roachpb.StoreIdent][]RaftHeartbeat{} 817 s.coalescedMu.heartbeatResponses = map[roachpb.StoreIdent][]RaftHeartbeat{} 818 s.coalescedMu.Unlock() 819 820 s.mu.Lock() 821 s.mu.replicaPlaceholders = map[roachpb.RangeID]*ReplicaPlaceholder{} 822 s.mu.replicasByKey = btree.New(64 /* degree */) 823 s.mu.uninitReplicas = map[roachpb.RangeID]*Replica{} 824 s.mu.Unlock() 825 826 s.unquiescedReplicas.Lock() 827 s.unquiescedReplicas.m = map[roachpb.RangeID]struct{}{} 828 s.unquiescedReplicas.Unlock() 829 830 s.rangefeedReplicas.Lock() 831 s.rangefeedReplicas.m = map[roachpb.RangeID]struct{}{} 832 s.rangefeedReplicas.Unlock() 833 834 s.tsCache = tscache.New(cfg.Clock) 835 s.metrics.registry.AddMetricStruct(s.tsCache.Metrics()) 836 837 s.txnWaitMetrics = txnwait.NewMetrics(cfg.HistogramWindowInterval) 838 s.metrics.registry.AddMetricStruct(s.txnWaitMetrics) 839 840 s.compactor = compactor.NewCompactor( 841 s.cfg.Settings, 842 s.engine, 843 func() (roachpb.StoreCapacity, error) { 844 return s.Capacity(false /* useCached */) 845 }, 846 func(ctx context.Context) { 847 s.asyncGossipStore(ctx, "compactor-initiated rocksdb compaction", false /* useCached */) 848 }, 849 ) 850 s.metrics.registry.AddMetricStruct(s.compactor.Metrics) 851 852 s.snapshotApplySem = make(chan struct{}, cfg.concurrentSnapshotApplyLimit) 853 854 s.renewableLeasesSignal = make(chan struct{}) 855 856 s.limiters.BulkIOWriteRate = rate.NewLimiter(rate.Limit(bulkIOWriteLimit.Get(&cfg.Settings.SV)), bulkIOWriteBurst) 857 bulkIOWriteLimit.SetOnChange(&cfg.Settings.SV, func() { 858 s.limiters.BulkIOWriteRate.SetLimit(rate.Limit(bulkIOWriteLimit.Get(&cfg.Settings.SV))) 859 }) 860 s.limiters.ConcurrentImportRequests = limit.MakeConcurrentRequestLimiter( 861 "importRequestLimiter", int(importRequestsLimit.Get(&cfg.Settings.SV)), 862 ) 863 importRequestsLimit.SetOnChange(&cfg.Settings.SV, func() { 864 s.limiters.ConcurrentImportRequests.SetLimit(int(importRequestsLimit.Get(&cfg.Settings.SV))) 865 }) 866 s.limiters.ConcurrentExportRequests = limit.MakeConcurrentRequestLimiter( 867 "exportRequestLimiter", int(ExportRequestsLimit.Get(&cfg.Settings.SV)), 868 ) 869 870 // The snapshot storage is usually empty at this point since it is cleared 871 // after each snapshot application, except when the node crashed right before 872 // it can clean it up. If this fails it's not a correctness issue since the 873 // storage is also cleared before receiving a snapshot. 
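// The SetOnChange wiring above is the general pattern for keeping a runtime
// object in sync with a cluster setting (editorial sketch; myRateSetting and
// burst are hypothetical, but the calls mirror the bulkIOWriteLimit code
// above):
//
//     lim := rate.NewLimiter(rate.Limit(myRateSetting.Get(&cfg.Settings.SV)), burst)
//     myRateSetting.SetOnChange(&cfg.Settings.SV, func() {
//         lim.SetLimit(rate.Limit(myRateSetting.Get(&cfg.Settings.SV)))
//     })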
874 s.sstSnapshotStorage = NewSSTSnapshotStorage(s.engine, s.limiters.BulkIOWriteRate) 875 if err := s.sstSnapshotStorage.Clear(); err != nil { 876 log.Warningf(ctx, "failed to clear snapshot storage: %v", err) 877 } 878 s.protectedtsCache = cfg.ProtectedTimestampCache 879 880 // On low-CPU instances, a default limit value may still allow ExportRequests 881 // to tie up all cores so cap limiter at cores-1 when setting value is higher. 882 exportCores := runtime.NumCPU() - 1 883 if exportCores < 1 { 884 exportCores = 1 885 } 886 ExportRequestsLimit.SetOnChange(&cfg.Settings.SV, func() { 887 limit := int(ExportRequestsLimit.Get(&cfg.Settings.SV)) 888 if limit > exportCores { 889 limit = exportCores 890 } 891 s.limiters.ConcurrentExportRequests.SetLimit(limit) 892 }) 893 s.limiters.ConcurrentAddSSTableRequests = limit.MakeConcurrentRequestLimiter( 894 "addSSTableRequestLimiter", int(addSSTableRequestLimit.Get(&cfg.Settings.SV)), 895 ) 896 addSSTableRequestLimit.SetOnChange(&cfg.Settings.SV, func() { 897 s.limiters.ConcurrentAddSSTableRequests.SetLimit(int(addSSTableRequestLimit.Get(&cfg.Settings.SV))) 898 }) 899 s.limiters.ConcurrentRangefeedIters = limit.MakeConcurrentRequestLimiter( 900 "rangefeedIterLimiter", int(concurrentRangefeedItersLimit.Get(&cfg.Settings.SV)), 901 ) 902 concurrentRangefeedItersLimit.SetOnChange(&cfg.Settings.SV, func() { 903 s.limiters.ConcurrentRangefeedIters.SetLimit( 904 int(concurrentRangefeedItersLimit.Get(&cfg.Settings.SV))) 905 }) 906 907 if s.cfg.Gossip != nil { 908 // Add range scanner and configure with queues. 909 s.scanner = newReplicaScanner( 910 s.cfg.AmbientCtx, s.cfg.Clock, cfg.ScanInterval, 911 cfg.ScanMinIdleTime, cfg.ScanMaxIdleTime, newStoreReplicaVisitor(s), 912 ) 913 s.gcQueue = newGCQueue(s, s.cfg.Gossip) 914 s.mergeQueue = newMergeQueue(s, s.db, s.cfg.Gossip) 915 s.splitQueue = newSplitQueue(s, s.db, s.cfg.Gossip) 916 s.replicateQueue = newReplicateQueue(s, s.cfg.Gossip, s.allocator) 917 s.replicaGCQueue = newReplicaGCQueue(s, s.db, s.cfg.Gossip) 918 s.raftLogQueue = newRaftLogQueue(s, s.db, s.cfg.Gossip) 919 s.raftSnapshotQueue = newRaftSnapshotQueue(s, s.cfg.Gossip) 920 s.consistencyQueue = newConsistencyQueue(s, s.cfg.Gossip) 921 // NOTE: If more queue types are added, please also add them to the list of 922 // queues on the EnqueueRange debug page as defined in 923 // pkg/ui/src/views/reports/containers/enqueueRange/index.tsx 924 s.scanner.AddQueues( 925 s.gcQueue, s.mergeQueue, s.splitQueue, s.replicateQueue, s.replicaGCQueue, 926 s.raftLogQueue, s.raftSnapshotQueue, s.consistencyQueue) 927 928 if s.cfg.TimeSeriesDataStore != nil { 929 s.tsMaintenanceQueue = newTimeSeriesMaintenanceQueue( 930 s, s.db, s.cfg.Gossip, s.cfg.TimeSeriesDataStore, 931 ) 932 s.scanner.AddQueues(s.tsMaintenanceQueue) 933 } 934 } 935 936 if cfg.TestingKnobs.DisableGCQueue { 937 s.setGCQueueActive(false) 938 } 939 if cfg.TestingKnobs.DisableMergeQueue { 940 s.setMergeQueueActive(false) 941 } 942 if cfg.TestingKnobs.DisableRaftLogQueue { 943 s.setRaftLogQueueActive(false) 944 } 945 if cfg.TestingKnobs.DisableReplicaGCQueue { 946 s.setReplicaGCQueueActive(false) 947 } 948 if cfg.TestingKnobs.DisableReplicateQueue { 949 s.SetReplicateQueueActive(false) 950 } 951 if cfg.TestingKnobs.DisableSplitQueue { 952 s.setSplitQueueActive(false) 953 } 954 if cfg.TestingKnobs.DisableTimeSeriesMaintenanceQueue { 955 s.setTimeSeriesMaintenanceQueueActive(false) 956 } 957 if cfg.TestingKnobs.DisableRaftSnapshotQueue { 958 s.setRaftSnapshotQueueActive(false) 959 } 960 if 
cfg.TestingKnobs.DisableConsistencyQueue { 961 s.setConsistencyQueueActive(false) 962 } 963 if cfg.TestingKnobs.DisableScanner { 964 s.setScannerActive(false) 965 } 966 967 return s 968 } 969 970 // String formats a store for debug output. 971 func (s *Store) String() string { 972 return fmt.Sprintf("[n%d,s%d]", s.Ident.NodeID, s.Ident.StoreID) 973 } 974 975 // ClusterSettings returns the node's ClusterSettings. 976 func (s *Store) ClusterSettings() *cluster.Settings { 977 return s.cfg.Settings 978 } 979 980 // AnnotateCtx is a convenience wrapper; see AmbientContext. 981 func (s *Store) AnnotateCtx(ctx context.Context) context.Context { 982 return s.cfg.AmbientCtx.AnnotateCtx(ctx) 983 } 984 985 // SetDraining (when called with 'true') causes incoming lease transfers to be 986 // rejected, prevents all of the Store's Replicas from acquiring or extending 987 // range leases, and attempts to transfer away any leases it owns. 988 // When called with 'false', returns to the normal mode of operation. 989 // 990 // The reporter callback, if non-nil, is called on a best effort basis 991 // to report work that needed to be done and which may or may not have 992 // been done by the time this call returns. See the explanation in 993 // pkg/server/drain.go for details. 994 func (s *Store) SetDraining(drain bool, reporter func(int, string)) { 995 s.draining.Store(drain) 996 if !drain { 997 newStoreReplicaVisitor(s).Visit(func(r *Replica) bool { 998 r.mu.Lock() 999 r.mu.draining = false 1000 r.mu.Unlock() 1001 return true 1002 }) 1003 return 1004 } 1005 1006 baseCtx := logtags.AddTag(context.Background(), "drain", nil) 1007 1008 // In a running server, the code below (transferAllAway and the loop 1009 // that calls it) does not need to be conditional on messaging by 1010 // the Stopper. This is because the top-level Server calls SetDrain 1011 // upon a graceful shutdown, and waits until the SetDrain call 1012 // completes, at which point the work has terminated on its own. If 1013 // the top-level server is forcefully shut down, it does not matter 1014 // if some of the code below is still running. 1015 // 1016 // However, the situation is different in unit tests where we also 1017 // assert there are no leaking goroutines when a test terminates. 1018 // If a test terminates with a timed-out lease transfer, it's 1019 // possible for the transferAllAway() closure to still be running 1020 // when the closer shuts down the test server. 1021 // 1022 // To prevent this, the code here adds the missing 1023 // cancel + wait in the particular case where the stopper is 1024 // completing a shutdown while a graceful SetDrain is still ongoing. 1025 ctx, cancelFn := s.stopper.WithCancelOnStop(baseCtx) 1026 defer cancelFn() 1027 1028 var wg sync.WaitGroup 1029 1030 transferAllAway := func(transferCtx context.Context) int { 1031 // Limit the number of concurrent lease transfers. 1032 const leaseTransferConcurrency = 100 1033 sem := quotapool.NewIntPool("Store.SetDraining", leaseTransferConcurrency) 1034 1035 // Incremented for every lease or Raft leadership transfer 1036 // attempted. We try to send both the lease and the Raft leadership 1037 // away, but this may not reliably work. Instead, we run the 1038 // surrounding retry loop until there are no leaders/leases left 1039 // (ignoring single-replica or uninitialized Raft groups).
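// The concurrency-limited fan-out below follows this general shape
// (editorial sketch; doWork is hypothetical):
//
//     sem := quotapool.NewIntPool("example", 100) // at most 100 tasks in flight
//     if err := s.stopper.RunLimitedAsyncTask(
//         ctx, "example task", sem, true /* wait */,
//         func(ctx context.Context) { doWork(ctx) },
//     ); err != nil {
//         // The stopper is quiescing or the ctx was canceled; the task never ran.
//     }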
1040 var numTransfersAttempted int32 1041 newStoreReplicaVisitor(s).Visit(func(r *Replica) bool { 1042 // 1043 // We need to be careful about the case where the ctx has been canceled 1044 // prior to the call to (*Stopper).RunLimitedAsyncTask(). In that case, 1045 // the goroutine is not even spawned. However, we don't want to 1046 // mis-count the missing goroutine as the lack of transfer attempted. 1047 // So what we do here is immediately increase numTransfersAttempted 1048 // to count this replica, and then decrease it when it is known 1049 // below that there is nothing to transfer (not lease holder and 1050 // not raft leader). 1051 atomic.AddInt32(&numTransfersAttempted, 1) 1052 wg.Add(1) 1053 if err := s.stopper.RunLimitedAsyncTask( 1054 r.AnnotateCtx(ctx), "storage.Store: draining replica", sem, true, /* wait */ 1055 func(ctx context.Context) { 1056 defer wg.Done() 1057 1058 select { 1059 case <-transferCtx.Done(): 1060 // Context canceled: the timeout loop has decided we've 1061 // done enough draining 1062 // (server.shutdown.lease_transfer_wait). 1063 // 1064 // We need this check here because each call of 1065 // transferAllAway() traverses all stores/replicas without 1066 // checking for the timeout otherwise. 1067 if log.V(1) { 1068 log.Infof(ctx, "lease transfer aborted due to exceeded timeout") 1069 } 1070 return 1071 default: 1072 } 1073 1074 r.mu.Lock() 1075 r.mu.draining = true 1076 status := r.raftStatusRLocked() 1077 // needsRaftTransfer is true when we can reasonably hope to transfer 1078 // this replica's lease and/or Raft leadership away. 1079 needsRaftTransfer := status != nil && 1080 len(status.Progress) > 1 && 1081 !(status.RaftState == raft.StateFollower && status.Lead != 0) 1082 r.mu.Unlock() 1083 1084 var drainingLease roachpb.Lease 1085 for { 1086 var llHandle *leaseRequestHandle 1087 r.mu.Lock() 1088 lease, nextLease := r.getLeaseRLocked() 1089 if nextLease != (roachpb.Lease{}) && nextLease.OwnedBy(s.StoreID()) { 1090 llHandle = r.mu.pendingLeaseRequest.JoinRequest() 1091 } 1092 r.mu.Unlock() 1093 1094 if llHandle != nil { 1095 <-llHandle.C() 1096 continue 1097 } 1098 drainingLease = lease 1099 break 1100 } 1101 1102 // Learner replicas aren't allowed to become the leaseholder or raft 1103 // leader, so only consider the `Voters` replicas. 1104 needsLeaseTransfer := len(r.Desc().Replicas().Voters()) > 1 && 1105 drainingLease.OwnedBy(s.StoreID()) && 1106 r.IsLeaseValid(drainingLease, s.Clock().Now()) 1107 1108 if !needsLeaseTransfer && !needsRaftTransfer { 1109 if log.V(1) { 1110 // This logging is useful to troubleshoot incomplete drains. 1111 log.Info(ctx, "not moving out") 1112 } 1113 atomic.AddInt32(&numTransfersAttempted, -1) 1114 return 1115 } 1116 if log.V(1) { 1117 // This logging is useful to troubleshoot incomplete drains. 1118 log.Infof(ctx, "trying to move replica out: lease transfer = %v, raft transfer = %v", needsLeaseTransfer, needsRaftTransfer) 1119 } 1120 1121 if needsLeaseTransfer { 1122 desc, zone := r.DescAndZone() 1123 leaseTransferred, err := s.replicateQueue.findTargetAndTransferLease( 1124 ctx, 1125 r, 1126 desc, 1127 zone, 1128 transferLeaseOptions{}, 1129 ) 1130 if log.V(1) && !leaseTransferred { 1131 // Note that a nil error means that there were no suitable 1132 // candidates. 
1133 log.Errorf( 1134 ctx, 1135 "did not transfer lease %s for replica %s when draining: %v", 1136 drainingLease, 1137 desc, 1138 err, 1139 ) 1140 } 1141 if err == nil && leaseTransferred { 1142 // If we just transferred the lease away, Raft leadership will 1143 // usually transfer with it. Invoking a separate Raft leadership 1144 // transfer would only obstruct this. 1145 needsRaftTransfer = false 1146 } 1147 } 1148 1149 if needsRaftTransfer { 1150 r.raftMu.Lock() 1151 r.maybeTransferRaftLeadership(ctx) 1152 r.raftMu.Unlock() 1153 } 1154 }); err != nil { 1155 if log.V(1) { 1156 log.Errorf(ctx, "error running draining task: %+v", err) 1157 } 1158 wg.Done() 1159 return false 1160 } 1161 return true 1162 }) 1163 wg.Wait() 1164 return int(numTransfersAttempted) 1165 } 1166 1167 // Give all replicas at least one chance to transfer. 1168 // If we don't do that, then it's possible that a configured 1169 // value for raftLeadershipTransferWait is too low to iterate 1170 // through all the replicas at least once, and the drain 1171 // condition on the remaining value will never be reached. 1172 if numRemaining := transferAllAway(ctx); numRemaining > 0 { 1173 // Report progress to the Drain RPC. 1174 if reporter != nil { 1175 reporter(numRemaining, "range lease iterations") 1176 } 1177 } else { 1178 // No more work to do. 1179 return 1180 } 1181 1182 // We've seen all the replicas once. Now we're going to iterate 1183 // until they're all gone, up to the configured timeout. 1184 transferTimeout := raftLeadershipTransferWait.Get(&s.cfg.Settings.SV) 1185 1186 if err := contextutil.RunWithTimeout(ctx, "wait for raft leadership transfer", transferTimeout, 1187 func(ctx context.Context) error { 1188 opts := retry.Options{ 1189 InitialBackoff: 10 * time.Millisecond, 1190 MaxBackoff: time.Second, 1191 Multiplier: 2, 1192 } 1193 everySecond := log.Every(time.Second) 1194 var err error 1195 // Avoid retry.ForDuration because of https://github.com/cockroachdb/cockroach/issues/25091. 1196 for r := retry.StartWithCtx(ctx, opts); r.Next(); { 1197 err = nil 1198 if numRemaining := transferAllAway(ctx); numRemaining > 0 { 1199 // Report progress to the Drain RPC. 1200 if reporter != nil { 1201 reporter(numRemaining, "range lease iterations") 1202 } 1203 err = errors.Errorf("waiting for %d replicas to transfer their lease away", numRemaining) 1204 if everySecond.ShouldLog() { 1205 log.Infof(ctx, "%v", err) 1206 } 1207 } 1208 if err == nil { 1209 // All leases transferred. We can stop retrying. 1210 break 1211 } 1212 } 1213 // If there's an error in the context but not yet detected in 1214 // err, take it into account here. 1215 return errors.CombineErrors(err, ctx.Err()) 1216 }); err != nil { 1217 // You expect this message when shutting down a server in an unhealthy 1218 // cluster. If we see it on healthy ones, there's likely something to fix. 1219 log.Warningf(ctx, "unable to drain cleanly within %s, service might briefly deteriorate: %+v", transferTimeout, err) 1220 } 1221 } 1222 1223 // IsStarted returns true if the Store has been started. 1224 func (s *Store) IsStarted() bool { 1225 return atomic.LoadInt32(&s.started) == 1 1226 } 1227 1228 // IterateIDPrefixKeys helps visit system keys that use RangeID prefixing (such 1229 // as RaftHardStateKey, RangeTombstoneKey, and many others). 
Such keys could in 1230 // principle exist at any RangeID, and this helper efficiently discovers all the 1231 // keys of the desired type (as specified by the supplied `keyFn`) and, for each 1232 // key-value pair discovered, unmarshals it into `msg` and then invokes `f`. 1233 // 1234 // Iteration stops on the first error (and will pass through that error). 1235 func IterateIDPrefixKeys( 1236 ctx context.Context, 1237 reader storage.Reader, 1238 keyFn func(roachpb.RangeID) roachpb.Key, 1239 msg protoutil.Message, 1240 f func(_ roachpb.RangeID) (more bool, _ error), 1241 ) error { 1242 rangeID := roachpb.RangeID(1) 1243 iter := reader.NewIterator(storage.IterOptions{ 1244 UpperBound: keys.LocalRangeIDPrefix.PrefixEnd().AsRawKey(), 1245 }) 1246 defer iter.Close() 1247 1248 for { 1249 bumped := false 1250 mvccKey := storage.MakeMVCCMetadataKey(keyFn(rangeID)) 1251 iter.SeekGE(mvccKey) 1252 1253 if ok, err := iter.Valid(); !ok { 1254 return err 1255 } 1256 1257 unsafeKey := iter.UnsafeKey() 1258 1259 if !bytes.HasPrefix(unsafeKey.Key, keys.LocalRangeIDPrefix) { 1260 // Left the local keyspace, so we're done. 1261 return nil 1262 } 1263 1264 curRangeID, _, _, _, err := keys.DecodeRangeIDKey(unsafeKey.Key) 1265 if err != nil { 1266 return err 1267 } 1268 1269 if curRangeID > rangeID { 1270 // `bumped` is always `false` here, but let's be explicit. 1271 if !bumped { 1272 rangeID = curRangeID 1273 bumped = true 1274 } 1275 mvccKey = storage.MakeMVCCMetadataKey(keyFn(rangeID)) 1276 } 1277 1278 if !unsafeKey.Key.Equal(mvccKey.Key) { 1279 if !bumped { 1280 // Don't increment the rangeID if it has already been incremented 1281 // above, or we could skip past a value we ought to see. 1282 rangeID++ 1283 bumped = true // for completeness' sake; continuing below anyway 1284 } 1285 continue 1286 } 1287 1288 ok, err := storage.MVCCGetProto( 1289 ctx, reader, unsafeKey.Key, hlc.Timestamp{}, msg, storage.MVCCGetOptions{}) 1290 if err != nil { 1291 return err 1292 } 1293 if !ok { 1294 return errors.Errorf("unable to unmarshal %s into %T", unsafeKey.Key, msg) 1295 } 1296 1297 more, err := f(rangeID) 1298 if !more || err != nil { 1299 return err 1300 } 1301 rangeID++ 1302 } 1303 } 1304 1305 // IterateRangeDescriptors calls the provided function with each descriptor 1306 // from the provided Engine. The return values of this method and fn have 1307 // semantics similar to engine.MVCCIterate. 1308 func IterateRangeDescriptors( 1309 ctx context.Context, 1310 reader storage.Reader, 1311 fn func(desc roachpb.RangeDescriptor) (done bool, err error), 1312 ) error { 1313 log.Event(ctx, "beginning range descriptor iteration") 1314 // Iterator over all range-local key-based data. 1315 start := keys.RangeDescriptorKey(roachpb.RKeyMin) 1316 end := keys.RangeDescriptorKey(roachpb.RKeyMax) 1317 1318 allCount := 0 1319 matchCount := 0 1320 bySuffix := make(map[string]int) 1321 kvToDesc := func(kv roachpb.KeyValue) (bool, error) { 1322 allCount++ 1323 // Only consider range metadata entries; ignore others. 
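// For a range starting at "a", the descriptor lives at a range-local key that
// pretty-prints as /Local/Range/"a"/RangeDescriptor; DecodeRangeKey splits such
// a key back into its start key and suffix, and only keys carrying the
// descriptor suffix are decoded below (editorial sketch):
//
//     k := keys.RangeDescriptorKey(roachpb.RKey("a"))
//     startKey, suffix, _, err := keys.DecodeRangeKey(k)
//     // err == nil, startKey equals "a", and
//     // bytes.Equal(suffix, keys.LocalRangeDescriptorSuffix) is true.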
1324 _, suffix, _, err := keys.DecodeRangeKey(kv.Key) 1325 if err != nil { 1326 return false, err 1327 } 1328 bySuffix[string(suffix)]++ 1329 if !bytes.Equal(suffix, keys.LocalRangeDescriptorSuffix) { 1330 return false, nil 1331 } 1332 var desc roachpb.RangeDescriptor 1333 if err := kv.Value.GetProto(&desc); err != nil { 1334 return false, err 1335 } 1336 matchCount++ 1337 return fn(desc) 1338 } 1339 1340 _, err := storage.MVCCIterate(ctx, reader, start, end, hlc.MaxTimestamp, 1341 storage.MVCCScanOptions{Inconsistent: true}, kvToDesc) 1342 log.Eventf(ctx, "iterated over %d keys to find %d range descriptors (by suffix: %v)", 1343 allCount, matchCount, bySuffix) 1344 return err 1345 } 1346 1347 // ReadStoreIdent reads the StoreIdent from the store. 1348 // It returns *NotBootstrappedError if the ident is missing (meaning that the 1349 // store needs to be bootstrapped). 1350 func ReadStoreIdent(ctx context.Context, eng storage.Engine) (roachpb.StoreIdent, error) { 1351 var ident roachpb.StoreIdent 1352 ok, err := storage.MVCCGetProto( 1353 ctx, eng, keys.StoreIdentKey(), hlc.Timestamp{}, &ident, storage.MVCCGetOptions{}) 1354 if err != nil { 1355 return roachpb.StoreIdent{}, err 1356 } else if !ok { 1357 return roachpb.StoreIdent{}, &NotBootstrappedError{} 1358 } 1359 return ident, err 1360 } 1361 1362 // Start the engine, set the GC and read the StoreIdent. 1363 func (s *Store) Start(ctx context.Context, stopper *stop.Stopper) error { 1364 s.stopper = stopper 1365 1366 // Populate the store ident. If not bootstrapped, ReadStoreIdent will 1367 // return an error. 1368 ident, err := ReadStoreIdent(ctx, s.engine) 1369 if err != nil { 1370 return err 1371 } 1372 s.Ident = &ident 1373 1374 // Set the store ID for logging. 1375 s.cfg.AmbientCtx.AddLogTag("s", s.StoreID()) 1376 ctx = s.AnnotateCtx(ctx) 1377 log.Event(ctx, "read store identity") 1378 1379 // Add the store ID to the scanner's AmbientContext before starting it, since 1380 // the AmbientContext provided during construction did not include it. 1381 // Note that this is just a hacky way of getting around that without 1382 // refactoring the scanner/queue construction/start logic more broadly, and 1383 // depends on the scanner not having added its own log tag. 1384 if s.scanner != nil { 1385 s.scanner.AmbientContext.AddLogTag("s", s.StoreID()) 1386 } 1387 1388 // If the nodeID is 0, it has not been assigned yet. 1389 if s.nodeDesc.NodeID != 0 && s.Ident.NodeID != s.nodeDesc.NodeID { 1390 return errors.Errorf("node id:%d does not equal the one in node descriptor:%d", s.Ident.NodeID, s.nodeDesc.NodeID) 1391 } 1392 // Always set gossip NodeID before gossiping any info. 1393 if s.cfg.Gossip != nil { 1394 s.cfg.Gossip.NodeID.Set(ctx, s.Ident.NodeID) 1395 } 1396 1397 // Create ID allocators. 1398 idAlloc, err := idalloc.NewAllocator(idalloc.Options{ 1399 AmbientCtx: s.cfg.AmbientCtx, 1400 Key: keys.RangeIDGenerator, 1401 Incrementer: idalloc.DBIncrementer(s.db), 1402 BlockSize: rangeIDAllocCount, 1403 Stopper: s.stopper, 1404 }) 1405 if err != nil { 1406 return err 1407 } 1408 1409 // Create the intent resolver.
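// TaskLimit semantics (see StoreConfig.IntentResolverTaskLimit above): 0 means
// "use the intent resolver's own default", while -1 disallows asynchronous
// tasks entirely. For instance, a test that wants fully synchronous intent
// resolution would configure (editorial sketch):
//
//     cfg.IntentResolverTaskLimit = -1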
1410 s.intentResolver = intentresolver.New(intentresolver.Config{ 1411 Clock: s.cfg.Clock, 1412 DB: s.db, 1413 Stopper: stopper, 1414 TaskLimit: s.cfg.IntentResolverTaskLimit, 1415 AmbientCtx: s.cfg.AmbientCtx, 1416 TestingKnobs: s.cfg.TestingKnobs.IntentResolverKnobs, 1417 RangeDescriptorCache: s.cfg.RangeDescriptorCache, 1418 }) 1419 s.metrics.registry.AddMetricStruct(s.intentResolver.Metrics) 1420 1421 // Create the recovery manager. 1422 s.recoveryMgr = txnrecovery.NewManager( 1423 s.cfg.AmbientCtx, s.cfg.Clock, s.db, stopper, 1424 ) 1425 s.metrics.registry.AddMetricStruct(s.recoveryMgr.Metrics()) 1426 1427 s.rangeIDAlloc = idAlloc 1428 1429 now := s.cfg.Clock.Now() 1430 s.startedAt = now.WallTime 1431 1432 // Iterate over all range descriptors, ignoring uncommitted versions 1433 // (consistent=false). Uncommitted intents which have been abandoned 1434 // due to a split crashing halfway will simply be resolved on the 1435 // next split attempt. They can otherwise be ignored. 1436 1437 // TODO(peter): While we have to iterate to find the replica descriptors 1438 // serially, we can perform the migrations and replica creation 1439 // concurrently. Note that while we can perform this initialization 1440 // concurrently, all of the initialization must be performed before we start 1441 // listening for Raft messages and starting the process Raft loop. 1442 err = IterateRangeDescriptors(ctx, s.engine, 1443 func(desc roachpb.RangeDescriptor) (bool, error) { 1444 if !desc.IsInitialized() { 1445 return false, errors.Errorf("found uninitialized RangeDescriptor: %+v", desc) 1446 } 1447 replicaDesc, found := desc.GetReplicaDescriptor(s.StoreID()) 1448 if !found { 1449 // This is a pre-emptive snapshot. It's also possible that this is a 1450 // range which has processed a raft command to remove itself (which is 1451 // possible prior to 19.2 or if the DisableEagerReplicaRemoval is 1452 // enabled) and has not yet been removed by the replica gc queue. 1453 // We treat both cases the same way. These should no longer exist in 1454 // 20.2 or after as there was a migration in 20.1 to remove them and 1455 // no pre-emptive snapshot should have been sent since 19.2 was 1456 // finalized. 1457 return false /* done */, errors.AssertionFailedf( 1458 "found RangeDescriptor for range %d at generation %d which does not"+ 1459 " contain this store %d", 1460 log.Safe(desc.RangeID), 1461 log.Safe(desc.Generation), 1462 log.Safe(s.StoreID())) 1463 } 1464 1465 rep, err := newReplica(ctx, &desc, s, replicaDesc.ReplicaID) 1466 if err != nil { 1467 return false, err 1468 } 1469 1470 // We can't lock s.mu across NewReplica due to the lock ordering 1471 // constraint (*Replica).raftMu < (*Store).mu. See the comment on 1472 // (Store).mu. 1473 s.mu.Lock() 1474 err = s.addReplicaInternalLocked(rep) 1475 s.mu.Unlock() 1476 if err != nil { 1477 return false, err 1478 } 1479 1480 // Add this range and its stats to our counter. 1481 s.metrics.ReplicaCount.Inc(1) 1482 s.metrics.addMVCCStats(rep.GetMVCCStats()) 1483 1484 if _, ok := desc.GetReplicaDescriptor(s.StoreID()); !ok { 1485 // We are no longer a member of the range, but we didn't GC the replica 1486 // before shutting down. Add the replica to the GC queue. 1487 s.replicaGCQueue.AddAsync(ctx, rep, replicaGCPriorityRemoved) 1488 } 1489 1490 // Note that we do not create raft groups at this time; they will be created 1491 // on-demand the first time they are needed. This helps reduce the amount of 1492 // election-related traffic in a cold start. 
1493 // Raft initialization occurs when we propose a command on this range or 1494 // receive a raft message addressed to it. 1495 // TODO(bdarnell): Also initialize raft groups when read leases are needed. 1496 // TODO(bdarnell): Scan all ranges at startup for unapplied log entries 1497 // and initialize those groups. 1498 return false, nil 1499 }) 1500 if err != nil { 1501 return err 1502 } 1503 1504 // Start Raft processing goroutines. 1505 s.cfg.Transport.Listen(s.StoreID(), s) 1506 s.processRaft(ctx) 1507 1508 // Register a callback to unquiesce any ranges with replicas on a 1509 // node transitioning from non-live to live. 1510 if s.cfg.NodeLiveness != nil { 1511 s.cfg.NodeLiveness.RegisterCallback(s.nodeIsLiveCallback) 1512 } 1513 1514 // Gossip is only ever nil while bootstrapping a cluster and 1515 // in unittests. 1516 if s.cfg.Gossip != nil { 1517 // Register update channel for any changes to the system config. 1518 // This may trigger splits along structured boundaries, 1519 // and update max range bytes. 1520 gossipUpdateC := s.cfg.Gossip.RegisterSystemConfigChannel() 1521 s.stopper.RunWorker(ctx, func(context.Context) { 1522 for { 1523 select { 1524 case <-gossipUpdateC: 1525 cfg := s.cfg.Gossip.GetSystemConfig() 1526 s.systemGossipUpdate(cfg) 1527 case <-s.stopper.ShouldStop(): 1528 return 1529 } 1530 } 1531 }) 1532 1533 // Start a single goroutine in charge of periodically gossiping the 1534 // sentinel and first range metadata if we have a first range. 1535 // This may wake up ranges and requires everything to be set up and 1536 // running. 1537 s.startGossip() 1538 1539 // Start the scanner. The construction here makes sure that the scanner 1540 // only starts after Gossip has connected, and that it does not block Start 1541 // from returning (as doing so might prevent Gossip from ever connecting). 1542 s.stopper.RunWorker(ctx, func(context.Context) { 1543 select { 1544 case <-s.cfg.Gossip.Connected: 1545 s.scanner.Start(s.stopper) 1546 case <-s.stopper.ShouldStop(): 1547 return 1548 } 1549 }) 1550 } 1551 1552 if !s.cfg.TestingKnobs.DisableAutomaticLeaseRenewal { 1553 s.startLeaseRenewer(ctx) 1554 } 1555 1556 // Connect rangefeeds to closed timestamp updates. 1557 s.startClosedTimestampRangefeedSubscriber(ctx) 1558 1559 if s.replicateQueue != nil { 1560 s.storeRebalancer = NewStoreRebalancer( 1561 s.cfg.AmbientCtx, s.cfg.Settings, s.replicateQueue, s.replRankings) 1562 s.storeRebalancer.Start(ctx, s.stopper) 1563 } 1564 1565 // Start the storage engine compactor. 1566 if envutil.EnvOrDefaultBool("COCKROACH_ENABLE_COMPACTOR", true) { 1567 s.compactor.Start(s.AnnotateCtx(context.Background()), s.stopper) 1568 } 1569 1570 // Set the started flag (for unittests). 1571 atomic.StoreInt32(&s.started, 1) 1572 1573 return nil 1574 } 1575 1576 // WaitForInit waits for any asynchronous processes begun in Start() 1577 // to complete their initialization. In particular, this includes 1578 // gossiping. In some cases this may block until the range GC queue 1579 // has completed its scan. Only for testing. 1580 func (s *Store) WaitForInit() { 1581 s.initComplete.Wait() 1582 } 1583 1584 var errPeriodicGossipsDisabled = errors.New("periodic gossip is disabled") 1585 1586 // startGossip runs an infinite loop in a goroutine which regularly checks 1587 // whether the store has a first range or config replica and asks those ranges 1588 // to gossip accordingly. 
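// Each gossip loop below signals s.initComplete after completing its first
// iteration, which is what WaitForInit blocks on.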
1589 func (s *Store) startGossip() { 1590 wakeReplica := func(ctx context.Context, repl *Replica) error { 1591 // Acquire the range lease, which in turn triggers system data gossip 1592 // functions (e.g. MaybeGossipSystemConfig or MaybeGossipNodeLiveness). 1593 _, pErr := repl.getLeaseForGossip(ctx) 1594 return pErr.GoError() 1595 } 1596 1597 if s.cfg.TestingKnobs.DisablePeriodicGossips { 1598 wakeReplica = func(context.Context, *Replica) error { 1599 return errPeriodicGossipsDisabled 1600 } 1601 } 1602 1603 gossipFns := []struct { 1604 key roachpb.Key 1605 fn func(context.Context, *Replica) error 1606 description string 1607 interval time.Duration 1608 }{ 1609 { 1610 key: roachpb.KeyMin, 1611 fn: func(ctx context.Context, repl *Replica) error { 1612 // The first range is gossiped by all replicas, not just the lease 1613 // holder, so wakeReplica is not used here. 1614 return repl.maybeGossipFirstRange(ctx).GoError() 1615 }, 1616 description: "first range descriptor", 1617 interval: s.cfg.SentinelGossipTTL() / 2, 1618 }, 1619 { 1620 key: keys.SystemConfigSpan.Key, 1621 fn: wakeReplica, 1622 description: "system config", 1623 interval: systemDataGossipInterval, 1624 }, 1625 { 1626 key: keys.NodeLivenessSpan.Key, 1627 fn: wakeReplica, 1628 description: "node liveness", 1629 interval: systemDataGossipInterval, 1630 }, 1631 } 1632 1633 // Periodic updates run in a goroutine and signal a WaitGroup upon completion 1634 // of their first iteration. 1635 s.initComplete.Add(len(gossipFns)) 1636 for _, gossipFn := range gossipFns { 1637 gossipFn := gossipFn // per-iteration copy 1638 s.stopper.RunWorker(context.Background(), func(ctx context.Context) { 1639 ticker := time.NewTicker(gossipFn.interval) 1640 defer ticker.Stop() 1641 for first := true; ; { 1642 // Retry in a backoff loop until gossipFn succeeds. The gossipFn might 1643 // temporarily fail (e.g. because node liveness hasn't initialized yet 1644 // making it impossible to get an epoch-based range lease), in which 1645 // case we want to retry quickly. 1646 retryOptions := base.DefaultRetryOptions() 1647 retryOptions.Closer = s.stopper.ShouldStop() 1648 for r := retry.Start(retryOptions); r.Next(); { 1649 if repl := s.LookupReplica(roachpb.RKey(gossipFn.key)); repl != nil { 1650 annotatedCtx := repl.AnnotateCtx(ctx) 1651 if err := gossipFn.fn(annotatedCtx, repl); err != nil { 1652 log.Warningf(annotatedCtx, "could not gossip %s: %+v", gossipFn.description, err) 1653 if !errors.Is(err, errPeriodicGossipsDisabled) { 1654 continue 1655 } 1656 } 1657 } 1658 break 1659 } 1660 if first { 1661 first = false 1662 s.initComplete.Done() 1663 } 1664 select { 1665 case <-ticker.C: 1666 case <-s.stopper.ShouldStop(): 1667 return 1668 } 1669 } 1670 }) 1671 } 1672 } 1673 1674 // startLeaseRenewer runs an infinite loop in a goroutine which regularly 1675 // checks whether the store has any expiration-based leases that should be 1676 // proactively renewed and attempts to continue renewing them. 1677 // 1678 // This reduces user-visible latency when range lookups are needed to serve a 1679 // request and reduces ping-ponging of r1's lease to different replicas as 1680 // maybeGossipFirstRange is called on each (e.g. #24753). 1681 func (s *Store) startLeaseRenewer(ctx context.Context) { 1682 // Start a goroutine that watches and proactively renews certain 1683 // expiration-based leases. 
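// Replicas that acquire an expiration-based lease worth keeping renewed are
// expected to register themselves in s.renewableLeases and to nudge
// s.renewableLeasesSignal so the loop below notices them promptly.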
1684 s.stopper.RunWorker(ctx, func(ctx context.Context) {
1685 repls := make(map[*Replica]struct{})
1686 timer := timeutil.NewTimer()
1687 defer timer.Stop()
1688
1689 // Determine how frequently to attempt to ensure that we have each lease.
1690 // The divisor used here is somewhat arbitrary, but needs to be large
1691 // enough to ensure we'll attempt to renew the lease reasonably early
1692 // within the RangeLeaseRenewalDuration time window. This means we'll wake
1693 // up more often than strictly necessary, but it's more maintainable than
1694 // attempting to accurately determine exactly when each iteration of a
1695 // lease expires and when we should attempt to renew it as a result.
1696 renewalDuration := s.cfg.RangeLeaseActiveDuration() / 5
1697 for {
1698 s.renewableLeases.Range(func(k int64, v unsafe.Pointer) bool {
1699 repl := (*Replica)(v)
1700 annotatedCtx := repl.AnnotateCtx(ctx)
1701 if _, pErr := repl.redirectOnOrAcquireLease(annotatedCtx); pErr != nil {
1702 if _, ok := pErr.GetDetail().(*roachpb.NotLeaseHolderError); !ok {
1703 log.Warningf(annotatedCtx, "failed to proactively renew lease: %s", pErr)
1704 }
1705 s.renewableLeases.Delete(k)
1706 }
1707 return true
1708 })
1709
1710 if len(repls) > 0 {
1711 timer.Reset(renewalDuration)
1712 }
1713 select {
1714 case <-s.renewableLeasesSignal:
1715 case <-timer.C:
1716 timer.Read = true
1717 case <-s.stopper.ShouldStop():
1718 return
1719 }
1720 }
1721 })
1722 }
1723
1724 // startClosedTimestampRangefeedSubscriber establishes a new ClosedTimestamp
1725 // subscription and runs an infinite loop to listen for closed timestamp updates
1726 // and inform Replicas with active Rangefeeds about them.
1727 func (s *Store) startClosedTimestampRangefeedSubscriber(ctx context.Context) {
1728 // NB: We can't use Stopper.RunWorker because doing so would race with
1729 // calling Stopper.Stop. We give the subscription channel a small capacity
1730 // to avoid blocking the closed timestamp goroutine.
1731 ch := make(chan ctpb.Entry, 8)
1732 const name = "closedts-rangefeed-subscriber"
1733 if err := s.stopper.RunAsyncTask(ctx, name, func(ctx context.Context) {
1734 s.cfg.ClosedTimestamp.Provider.Subscribe(ctx, ch)
1735 }); err != nil {
1736 return
1737 }
1738
1739 s.stopper.RunWorker(ctx, func(ctx context.Context) {
1740 var replIDs []roachpb.RangeID
1741 for {
1742 select {
1743 case <-ch:
1744 // Drain all notifications from the channel.
1745 loop:
1746 for {
1747 select {
1748 case _, ok := <-ch:
1749 if !ok {
1750 break loop
1751 }
1752 default:
1753 break loop
1754 }
1755 }
1756
1757 // Gather replicas to notify under lock.
1758 s.rangefeedReplicas.Lock()
1759 for replID := range s.rangefeedReplicas.m {
1760 replIDs = append(replIDs, replID)
1761 }
1762 s.rangefeedReplicas.Unlock()
1763
1764 // Notify each replica with an active rangefeed to
1765 // check for an updated closed timestamp.
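// A replica may have been removed or destroyed between gathering its ID
// above and the lookup below; in that case GetReplica returns an error and
// the notification is simply skipped.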
1766 for _, replID := range replIDs { 1767 repl, err := s.GetReplica(replID) 1768 if err != nil { 1769 continue 1770 } 1771 repl.handleClosedTimestampUpdate(ctx) 1772 } 1773 replIDs = replIDs[:0] 1774 case <-s.stopper.ShouldQuiesce(): 1775 return 1776 } 1777 } 1778 }) 1779 } 1780 1781 func (s *Store) addReplicaWithRangefeed(rangeID roachpb.RangeID) { 1782 s.rangefeedReplicas.Lock() 1783 s.rangefeedReplicas.m[rangeID] = struct{}{} 1784 s.rangefeedReplicas.Unlock() 1785 } 1786 1787 func (s *Store) removeReplicaWithRangefeed(rangeID roachpb.RangeID) { 1788 s.rangefeedReplicas.Lock() 1789 delete(s.rangefeedReplicas.m, rangeID) 1790 s.rangefeedReplicas.Unlock() 1791 } 1792 1793 // systemGossipUpdate is a callback for gossip updates to 1794 // the system config which affect range split boundaries. 1795 func (s *Store) systemGossipUpdate(sysCfg *config.SystemConfig) { 1796 ctx := s.AnnotateCtx(context.Background()) 1797 s.computeInitialMetrics.Do(func() { 1798 // Metrics depend in part on the system config. Compute them as soon as we 1799 // get the first system config, then periodically in the background 1800 // (managed by the Node). 1801 if err := s.ComputeMetrics(ctx, -1); err != nil { 1802 log.Infof(ctx, "%s: failed initial metrics computation: %s", s, err) 1803 } 1804 log.Event(ctx, "computed initial metrics") 1805 }) 1806 1807 // We'll want to offer all replicas to the split and merge queues. Be a little 1808 // careful about not spawning too many individual goroutines. 1809 1810 // For every range, update its zone config and check if it needs to 1811 // be split or merged. 1812 now := s.cfg.Clock.Now() 1813 newStoreReplicaVisitor(s).Visit(func(repl *Replica) bool { 1814 key := repl.Desc().StartKey 1815 zone, err := sysCfg.GetZoneConfigForKey(key) 1816 if err != nil { 1817 if log.V(1) { 1818 log.Infof(context.TODO(), "failed to get zone config for key %s", key) 1819 } 1820 zone = s.cfg.DefaultZoneConfig 1821 } 1822 repl.SetZoneConfig(zone) 1823 s.splitQueue.Async(ctx, "gossip update", true /* wait */, func(ctx context.Context, h queueHelper) { 1824 h.MaybeAdd(ctx, repl, now) 1825 }) 1826 s.mergeQueue.Async(ctx, "gossip update", true /* wait */, func(ctx context.Context, h queueHelper) { 1827 h.MaybeAdd(ctx, repl, now) 1828 }) 1829 return true // more 1830 }) 1831 } 1832 1833 func (s *Store) asyncGossipStore(ctx context.Context, reason string, useCached bool) { 1834 if err := s.stopper.RunAsyncTask( 1835 ctx, fmt.Sprintf("storage.Store: gossip on %s", reason), 1836 func(ctx context.Context) { 1837 if err := s.GossipStore(ctx, useCached); err != nil { 1838 log.Warningf(ctx, "error gossiping on %s: %+v", reason, err) 1839 } 1840 }); err != nil { 1841 log.Warningf(ctx, "unable to gossip on %s: %+v", reason, err) 1842 } 1843 } 1844 1845 // GossipStore broadcasts the store on the gossip network. 1846 func (s *Store) GossipStore(ctx context.Context, useCached bool) error { 1847 // Temporarily indicate that we're gossiping the store capacity to avoid 1848 // recursively triggering a gossip of the store capacity. 1849 syncutil.StoreFloat64(&s.gossipQueriesPerSecondVal, -1) 1850 syncutil.StoreFloat64(&s.gossipWritesPerSecondVal, -1) 1851 1852 storeDesc, err := s.Descriptor(useCached) 1853 if err != nil { 1854 return errors.Wrapf(err, "problem getting store descriptor for store %+v", s.Ident) 1855 } 1856 1857 // Set countdown target for re-gossiping capacity earlier than 1858 // the usual periodic interval. 
Re-gossip more rapidly for RangeCount 1859 // changes because allocators with stale information are much more 1860 // likely to make bad decisions. 1861 rangeCountdown := float64(storeDesc.Capacity.RangeCount) * s.cfg.GossipWhenCapacityDeltaExceedsFraction 1862 atomic.StoreInt32(&s.gossipRangeCountdown, int32(math.Ceil(math.Min(rangeCountdown, 3)))) 1863 leaseCountdown := float64(storeDesc.Capacity.LeaseCount) * s.cfg.GossipWhenCapacityDeltaExceedsFraction 1864 atomic.StoreInt32(&s.gossipLeaseCountdown, int32(math.Ceil(math.Max(leaseCountdown, 1)))) 1865 syncutil.StoreFloat64(&s.gossipQueriesPerSecondVal, storeDesc.Capacity.QueriesPerSecond) 1866 syncutil.StoreFloat64(&s.gossipWritesPerSecondVal, storeDesc.Capacity.WritesPerSecond) 1867 1868 // Unique gossip key per store. 1869 gossipStoreKey := gossip.MakeStoreKey(storeDesc.StoreID) 1870 // Gossip store descriptor. 1871 return s.cfg.Gossip.AddInfoProto(gossipStoreKey, storeDesc, gossip.StoreTTL) 1872 } 1873 1874 type capacityChangeEvent int 1875 1876 const ( 1877 rangeAddEvent capacityChangeEvent = iota 1878 rangeRemoveEvent 1879 leaseAddEvent 1880 leaseRemoveEvent 1881 ) 1882 1883 // maybeGossipOnCapacityChange decrements the countdown on range 1884 // and leaseholder counts. If it reaches 0, then we trigger an 1885 // immediate gossip of this store's descriptor, to include updated 1886 // capacity information. 1887 func (s *Store) maybeGossipOnCapacityChange(ctx context.Context, cce capacityChangeEvent) { 1888 if s.cfg.TestingKnobs.DisableLeaseCapacityGossip && (cce == leaseAddEvent || cce == leaseRemoveEvent) { 1889 return 1890 } 1891 1892 // Incrementally adjust stats to keep them up to date even if the 1893 // capacity is gossiped, but isn't due yet to be recomputed from scratch. 1894 s.cachedCapacity.Lock() 1895 switch cce { 1896 case rangeAddEvent: 1897 s.cachedCapacity.RangeCount++ 1898 case rangeRemoveEvent: 1899 s.cachedCapacity.RangeCount-- 1900 case leaseAddEvent: 1901 s.cachedCapacity.LeaseCount++ 1902 case leaseRemoveEvent: 1903 s.cachedCapacity.LeaseCount-- 1904 } 1905 s.cachedCapacity.Unlock() 1906 1907 if ((cce == rangeAddEvent || cce == rangeRemoveEvent) && atomic.AddInt32(&s.gossipRangeCountdown, -1) == 0) || 1908 ((cce == leaseAddEvent || cce == leaseRemoveEvent) && atomic.AddInt32(&s.gossipLeaseCountdown, -1) == 0) { 1909 // Reset countdowns to avoid unnecessary gossiping. 1910 atomic.StoreInt32(&s.gossipRangeCountdown, 0) 1911 atomic.StoreInt32(&s.gossipLeaseCountdown, 0) 1912 s.asyncGossipStore(ctx, "capacity change", true /* useCached */) 1913 } 1914 } 1915 1916 // recordNewPerSecondStats takes recently calculated values for the number of 1917 // queries and key writes the store is handling and decides whether either has 1918 // changed enough to justify re-gossiping the store's capacity. 1919 func (s *Store) recordNewPerSecondStats(newQPS, newWPS float64) { 1920 oldQPS := syncutil.LoadFloat64(&s.gossipQueriesPerSecondVal) 1921 oldWPS := syncutil.LoadFloat64(&s.gossipWritesPerSecondVal) 1922 if oldQPS == -1 || oldWPS == -1 { 1923 // Gossiping of store capacity is already ongoing. 
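// GossipStore writes the -1 sentinels while it recomputes the capacity, so
// there is no need to trigger another gossip here.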
1924 return
1925 }
1926
1927 const minAbsoluteChange = 100
1928 updateForQPS := (newQPS < oldQPS*.5 || newQPS > oldQPS*1.5) && math.Abs(newQPS-oldQPS) > minAbsoluteChange
1929 updateForWPS := (newWPS < oldWPS*.5 || newWPS > oldWPS*1.5) && math.Abs(newWPS-oldWPS) > minAbsoluteChange
1930
1931 if !updateForQPS && !updateForWPS {
1932 return
1933 }
1934
1935 var message string
1936 if updateForQPS && updateForWPS {
1937 message = "queries-per-second and writes-per-second change"
1938 } else if updateForQPS {
1939 message = "queries-per-second change"
1940 } else {
1941 message = "writes-per-second change"
1942 }
1943 // TODO(a-robinson): Use the provided values to avoid having to recalculate
1944 // them in GossipStore.
1945 s.asyncGossipStore(context.TODO(), message, false /* useCached */)
1946 }
1947
1948 // VisitReplicas invokes the visitor on the Store's Replicas until the visitor returns false.
1949 // Replicas which are added to the Store after iteration begins may or may not be observed.
1950 func (s *Store) VisitReplicas(visitor func(*Replica) (wantMore bool)) {
1951 v := newStoreReplicaVisitor(s)
1952 v.Visit(visitor)
1953 }
1954
1955 // WriteLastUpTimestamp records the supplied timestamp into the "last up" key
1956 // on this store. This value should be refreshed whenever this store's node
1957 // updates its own liveness record; it is used by a restarting store to
1958 // determine the approximate time that it stopped.
1959 func (s *Store) WriteLastUpTimestamp(ctx context.Context, time hlc.Timestamp) error {
1960 ctx = s.AnnotateCtx(ctx)
1961 return storage.MVCCPutProto(
1962 ctx,
1963 s.engine,
1964 nil,
1965 keys.StoreLastUpKey(),
1966 hlc.Timestamp{},
1967 nil,
1968 &time,
1969 )
1970 }
1971
1972 // ReadLastUpTimestamp returns the "last up" timestamp recorded in this store.
1973 // This value can be used to approximate the last time the engine was being
1974 // served as a store by a running node. If the store does not contain a "last
1975 // up" timestamp (for example, on a newly bootstrapped store), the zero
1976 // timestamp is returned instead.
1977 func (s *Store) ReadLastUpTimestamp(ctx context.Context) (hlc.Timestamp, error) {
1978 var timestamp hlc.Timestamp
1979 ok, err := storage.MVCCGetProto(ctx, s.Engine(), keys.StoreLastUpKey(), hlc.Timestamp{},
1980 &timestamp, storage.MVCCGetOptions{})
1981 if err != nil {
1982 return hlc.Timestamp{}, err
1983 } else if !ok {
1984 return hlc.Timestamp{}, nil
1985 }
1986 return timestamp, nil
1987 }
1988
1989 // WriteHLCUpperBound records an upper bound to the wall time of the HLC
1990 func (s *Store) WriteHLCUpperBound(ctx context.Context, time int64) error {
1991 ctx = s.AnnotateCtx(ctx)
1992 ts := hlc.Timestamp{WallTime: time}
1993 batch := s.Engine().NewBatch()
1994 // Write has to sync to disk to ensure HLC monotonicity across restarts
1995 defer batch.Close()
1996 if err := storage.MVCCPutProto(
1997 ctx,
1998 batch,
1999 nil,
2000 keys.StoreHLCUpperBoundKey(),
2001 hlc.Timestamp{},
2002 nil,
2003 &ts,
2004 ); err != nil {
2005 return err
2006 }
2007
2008 if err := batch.Commit(true /* sync */); err != nil {
2009 return err
2010 }
2011 return nil
2012 }
2013
2014 // ReadHLCUpperBound returns the upper bound to the wall time of the HLC
2015 // If this value does not exist 0 is returned
2016 func ReadHLCUpperBound(ctx context.Context, e storage.Engine) (int64, error) {
2017 var timestamp hlc.Timestamp
2018 ok, err := storage.MVCCGetProto(ctx, e, keys.StoreHLCUpperBoundKey(), hlc.Timestamp{},
2019 &timestamp, storage.MVCCGetOptions{})
2020 if err != nil {
2021 return 0, err
2022 } else if !ok {
2023 return 0, nil
2024 }
2025 return timestamp.WallTime, nil
2026 }
2027
2028 // ReadMaxHLCUpperBound returns the maximum of the stored hlc upper bounds
2029 // among all the engines. This value is optionally persisted by the server and
2030 // it is guaranteed to be higher than any wall time used by the HLC. If this
2031 // value is persisted, HLC wall clock monotonicity is guaranteed across server
2032 // restarts
2033 func ReadMaxHLCUpperBound(ctx context.Context, engines []storage.Engine) (int64, error) {
2034 var hlcUpperBound int64
2035 for _, e := range engines {
2036 engineHLCUpperBound, err := ReadHLCUpperBound(ctx, e)
2037 if err != nil {
2038 return 0, err
2039 }
2040 if engineHLCUpperBound > hlcUpperBound {
2041 hlcUpperBound = engineHLCUpperBound
2042 }
2043 }
2044 return hlcUpperBound, nil
2045 }
2046
2047 // checkCanInitializeEngine ensures that the engine is empty except for a
2048 // cluster version, which must be present.
2049 func checkCanInitializeEngine(ctx context.Context, eng storage.Engine) error {
2050 kvs, err := storage.Scan(eng, roachpb.KeyMin, roachpb.KeyMax, 10)
2051 if err != nil {
2052 return err
2053 }
2054 // See if this is an already-bootstrapped store.
2055 ident, err := ReadStoreIdent(ctx, eng)
2056 if err == nil {
2057 return errors.Errorf("engine already initialized as %s", ident.String())
2058 } else if !errors.HasType(err, (*NotBootstrappedError)(nil)) {
2059 return errors.Wrap(err, "unable to read store ident")
2060 }
2061
2062 // Engine is not bootstrapped yet (i.e. no StoreIdent). Does it contain
2063 // a cluster version and nothing else?
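// Anything other than the cluster version key in the bounded scan above
// means the engine already contains data and must not be initialized here.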
2064 2065 var sawClusterVersion bool 2066 var keyVals []string 2067 for _, kv := range kvs { 2068 if kv.Key.Key.Equal(keys.StoreClusterVersionKey()) { 2069 sawClusterVersion = true 2070 continue 2071 } 2072 keyVals = append(keyVals, fmt.Sprintf("%s: %q", kv.Key, kv.Value)) 2073 } 2074 if len(keyVals) > 0 { 2075 return errors.Errorf("engine cannot be bootstrapped, contains:\n%s", keyVals) 2076 } 2077 if !sawClusterVersion { 2078 return errors.New("no cluster version found on uninitialized engine") 2079 } 2080 2081 return nil 2082 } 2083 2084 // GetReplica fetches a replica by Range ID. Returns an error if no replica is found. 2085 func (s *Store) GetReplica(rangeID roachpb.RangeID) (*Replica, error) { 2086 if value, ok := s.mu.replicas.Load(int64(rangeID)); ok { 2087 return (*Replica)(value), nil 2088 } 2089 return nil, roachpb.NewRangeNotFoundError(rangeID, s.StoreID()) 2090 } 2091 2092 // LookupReplica looks up the replica that contains the specified key. It 2093 // returns nil if no such replica exists. 2094 func (s *Store) LookupReplica(key roachpb.RKey) *Replica { 2095 s.mu.RLock() 2096 defer s.mu.RUnlock() 2097 var repl *Replica 2098 s.mu.replicasByKey.DescendLessOrEqual(rangeBTreeKey(key), func(item btree.Item) bool { 2099 repl, _ = item.(*Replica) 2100 // Stop iterating immediately. The first item we see is the only one that 2101 // can possibly contain key. 2102 return false 2103 }) 2104 if repl == nil || !repl.Desc().ContainsKey(key) { 2105 return nil 2106 } 2107 return repl 2108 } 2109 2110 // lookupPrecedingReplica finds the replica in this store that immediately 2111 // precedes the specified key without containing it. It returns nil if no such 2112 // replica exists. It ignores replica placeholders. 2113 // 2114 // Concretely, when key represents a key within replica R, 2115 // lookupPrecedingReplica returns the replica that immediately precedes R in 2116 // replicasByKey. 2117 func (s *Store) lookupPrecedingReplica(key roachpb.RKey) *Replica { 2118 s.mu.RLock() 2119 defer s.mu.RUnlock() 2120 var repl *Replica 2121 s.mu.replicasByKey.DescendLessOrEqual(rangeBTreeKey(key), func(item btree.Item) bool { 2122 if r, ok := item.(*Replica); ok && !r.ContainsKey(key.AsRawKey()) { 2123 repl = r 2124 return false // stop iterating 2125 } 2126 return true // keep iterating 2127 }) 2128 return repl 2129 } 2130 2131 // getOverlappingKeyRangeLocked returns a KeyRange from the Store overlapping the given 2132 // descriptor (or nil if no such KeyRange exists). 2133 func (s *Store) getOverlappingKeyRangeLocked(rngDesc *roachpb.RangeDescriptor) KeyRange { 2134 var kr KeyRange 2135 s.mu.replicasByKey.DescendLessOrEqual(rangeBTreeKey(rngDesc.EndKey), 2136 func(item btree.Item) bool { 2137 if kr0 := item.(KeyRange); kr0.startKey().Less(rngDesc.EndKey) { 2138 kr = kr0 2139 return false // stop iterating 2140 } 2141 return true // keep iterating 2142 }) 2143 if kr != nil && rngDesc.StartKey.Less(kr.Desc().EndKey) { 2144 return kr 2145 } 2146 return nil 2147 } 2148 2149 // RaftStatus returns the current raft status of the local replica of 2150 // the given range. 2151 func (s *Store) RaftStatus(rangeID roachpb.RangeID) *raft.Status { 2152 if value, ok := s.mu.replicas.Load(int64(rangeID)); ok { 2153 return (*Replica)(value).RaftStatus() 2154 } 2155 return nil 2156 } 2157 2158 // ClusterID accessor. 2159 func (s *Store) ClusterID() uuid.UUID { return s.Ident.ClusterID } 2160 2161 // StoreID accessor. 2162 func (s *Store) StoreID() roachpb.StoreID { return s.Ident.StoreID } 2163 2164 // Clock accessor. 
2165 func (s *Store) Clock() *hlc.Clock { return s.cfg.Clock } 2166 2167 // Engine accessor. 2168 func (s *Store) Engine() storage.Engine { return s.engine } 2169 2170 // DB accessor. 2171 func (s *Store) DB() *kv.DB { return s.cfg.DB } 2172 2173 // Gossip accessor. 2174 func (s *Store) Gossip() *gossip.Gossip { return s.cfg.Gossip } 2175 2176 // Compactor accessor. 2177 func (s *Store) Compactor() *compactor.Compactor { return s.compactor } 2178 2179 // Stopper accessor. 2180 func (s *Store) Stopper() *stop.Stopper { return s.stopper } 2181 2182 // TestingKnobs accessor. 2183 func (s *Store) TestingKnobs() *StoreTestingKnobs { return &s.cfg.TestingKnobs } 2184 2185 // IsDraining accessor. 2186 func (s *Store) IsDraining() bool { 2187 return s.draining.Load().(bool) 2188 } 2189 2190 // AllocateRangeID allocates a new RangeID from the cluster-wide RangeID allocator. 2191 func (s *Store) AllocateRangeID(ctx context.Context) (roachpb.RangeID, error) { 2192 id, err := s.rangeIDAlloc.Allocate(ctx) 2193 if err != nil { 2194 return 0, err 2195 } 2196 return roachpb.RangeID(id), nil 2197 } 2198 2199 // Attrs returns the attributes of the underlying store. 2200 func (s *Store) Attrs() roachpb.Attributes { 2201 return s.engine.Attrs() 2202 } 2203 2204 // Capacity returns the capacity of the underlying storage engine. Note that 2205 // this does not include reservations. 2206 // Note that Capacity() has the side effect of updating some of the store's 2207 // internal statistics about its replicas. 2208 func (s *Store) Capacity(useCached bool) (roachpb.StoreCapacity, error) { 2209 if useCached { 2210 s.cachedCapacity.Lock() 2211 capacity := s.cachedCapacity.StoreCapacity 2212 s.cachedCapacity.Unlock() 2213 if capacity != (roachpb.StoreCapacity{}) { 2214 return capacity, nil 2215 } 2216 } 2217 2218 capacity, err := s.engine.Capacity() 2219 if err != nil { 2220 return capacity, err 2221 } 2222 2223 now := s.cfg.Clock.Now() 2224 var leaseCount int32 2225 var rangeCount int32 2226 var logicalBytes int64 2227 var totalQueriesPerSecond float64 2228 var totalWritesPerSecond float64 2229 replicaCount := s.metrics.ReplicaCount.Value() 2230 bytesPerReplica := make([]float64, 0, replicaCount) 2231 writesPerReplica := make([]float64, 0, replicaCount) 2232 rankingsAccumulator := s.replRankings.newAccumulator() 2233 newStoreReplicaVisitor(s).Visit(func(r *Replica) bool { 2234 rangeCount++ 2235 if r.OwnsValidLease(now) { 2236 leaseCount++ 2237 } 2238 mvccStats := r.GetMVCCStats() 2239 logicalBytes += mvccStats.Total() 2240 bytesPerReplica = append(bytesPerReplica, float64(mvccStats.Total())) 2241 // TODO(a-robinson): How dangerous is it that these numbers will be 2242 // incorrectly low the first time or two it gets gossiped when a store 2243 // starts? We can't easily have a countdown as its value changes like for 2244 // leases/replicas. 2245 var qps float64 2246 if avgQPS, dur := r.leaseholderStats.avgQPS(); dur >= MinStatsDuration { 2247 qps = avgQPS 2248 totalQueriesPerSecond += avgQPS 2249 // TODO(a-robinson): Calculate percentiles for qps? Get rid of other percentiles? 
2250 } 2251 if wps, dur := r.writeStats.avgQPS(); dur >= MinStatsDuration { 2252 totalWritesPerSecond += wps 2253 writesPerReplica = append(writesPerReplica, wps) 2254 } 2255 rankingsAccumulator.addReplica(replicaWithStats{ 2256 repl: r, 2257 qps: qps, 2258 }) 2259 return true 2260 }) 2261 capacity.RangeCount = rangeCount 2262 capacity.LeaseCount = leaseCount 2263 capacity.LogicalBytes = logicalBytes 2264 capacity.QueriesPerSecond = totalQueriesPerSecond 2265 capacity.WritesPerSecond = totalWritesPerSecond 2266 capacity.BytesPerReplica = roachpb.PercentilesFromData(bytesPerReplica) 2267 capacity.WritesPerReplica = roachpb.PercentilesFromData(writesPerReplica) 2268 s.recordNewPerSecondStats(totalQueriesPerSecond, totalWritesPerSecond) 2269 s.replRankings.update(rankingsAccumulator) 2270 2271 s.cachedCapacity.Lock() 2272 s.cachedCapacity.StoreCapacity = capacity 2273 s.cachedCapacity.Unlock() 2274 2275 return capacity, nil 2276 } 2277 2278 // ReplicaCount returns the number of replicas contained by this store. This 2279 // method is O(n) in the number of replicas and should not be called from 2280 // performance critical code. 2281 func (s *Store) ReplicaCount() int { 2282 var count int 2283 s.mu.replicas.Range(func(_ int64, _ unsafe.Pointer) bool { 2284 count++ 2285 return true 2286 }) 2287 return count 2288 } 2289 2290 // Registry returns the store registry. 2291 func (s *Store) Registry() *metric.Registry { 2292 return s.metrics.registry 2293 } 2294 2295 // Metrics returns the store's metric struct. 2296 func (s *Store) Metrics() *StoreMetrics { 2297 return s.metrics 2298 } 2299 2300 // Descriptor returns a StoreDescriptor including current store 2301 // capacity information. 2302 func (s *Store) Descriptor(useCached bool) (*roachpb.StoreDescriptor, error) { 2303 capacity, err := s.Capacity(useCached) 2304 if err != nil { 2305 return nil, err 2306 } 2307 2308 // Initialize the store descriptor. 2309 return &roachpb.StoreDescriptor{ 2310 StoreID: s.Ident.StoreID, 2311 Attrs: s.Attrs(), 2312 Node: *s.nodeDesc, 2313 Capacity: capacity, 2314 }, nil 2315 } 2316 2317 // RangeFeed registers a rangefeed over the specified span. It sends updates to 2318 // the provided stream and returns with an optional error when the rangefeed is 2319 // complete. 2320 func (s *Store) RangeFeed( 2321 args *roachpb.RangeFeedRequest, stream roachpb.Internal_RangeFeedServer, 2322 ) *roachpb.Error { 2323 2324 if filter := s.TestingKnobs().TestingRangefeedFilter; filter != nil { 2325 if pErr := filter(args, stream); pErr != nil { 2326 return pErr 2327 } 2328 } 2329 2330 if err := verifyKeys(args.Span.Key, args.Span.EndKey, true); err != nil { 2331 return roachpb.NewError(err) 2332 } 2333 2334 // Get range and add command to the range for execution. 2335 repl, err := s.GetReplica(args.RangeID) 2336 if err != nil { 2337 return roachpb.NewError(err) 2338 } 2339 if !repl.IsInitialized() { 2340 // (*Store).Send has an optimization for uninitialized replicas to send back 2341 // a NotLeaseHolderError with a hint of where an initialized replica might 2342 // be found. RangeFeeds can always be served from followers and so don't 2343 // otherwise return NotLeaseHolderError. For simplicity we also don't return 2344 // one here. 2345 return roachpb.NewError(roachpb.NewRangeNotFoundError(args.RangeID, s.StoreID())) 2346 } 2347 return repl.RangeFeed(args, stream) 2348 } 2349 2350 // updateReplicationGauges counts a number of simple replication statistics for 2351 // the ranges in this store. 
2352 // TODO(bram): #4564 It may be appropriate to compute these statistics while 2353 // scanning ranges. An ideal solution would be to create incremental events 2354 // whenever availability changes. 2355 func (s *Store) updateReplicationGauges(ctx context.Context) error { 2356 // Load the system config. 2357 cfg := s.Gossip().GetSystemConfig() 2358 if cfg == nil { 2359 return errors.Errorf("%s: system config not yet available", s) 2360 } 2361 2362 var ( 2363 raftLeaderCount int64 2364 leaseHolderCount int64 2365 leaseExpirationCount int64 2366 leaseEpochCount int64 2367 raftLeaderNotLeaseHolderCount int64 2368 quiescentCount int64 2369 averageQueriesPerSecond float64 2370 averageWritesPerSecond float64 2371 2372 rangeCount int64 2373 unavailableRangeCount int64 2374 underreplicatedRangeCount int64 2375 overreplicatedRangeCount int64 2376 behindCount int64 2377 ) 2378 2379 timestamp := s.cfg.Clock.Now() 2380 var livenessMap IsLiveMap 2381 if s.cfg.NodeLiveness != nil { 2382 livenessMap = s.cfg.NodeLiveness.GetIsLiveMap() 2383 } 2384 clusterNodes := s.ClusterNodeCount() 2385 2386 var minMaxClosedTS hlc.Timestamp 2387 newStoreReplicaVisitor(s).Visit(func(rep *Replica) bool { 2388 metrics := rep.Metrics(ctx, timestamp, livenessMap, clusterNodes) 2389 if metrics.Leader { 2390 raftLeaderCount++ 2391 if metrics.LeaseValid && !metrics.Leaseholder { 2392 raftLeaderNotLeaseHolderCount++ 2393 } 2394 } 2395 if metrics.Leaseholder { 2396 leaseHolderCount++ 2397 switch metrics.LeaseType { 2398 case roachpb.LeaseNone: 2399 case roachpb.LeaseExpiration: 2400 leaseExpirationCount++ 2401 case roachpb.LeaseEpoch: 2402 leaseEpochCount++ 2403 } 2404 } 2405 if metrics.Quiescent { 2406 quiescentCount++ 2407 } 2408 if metrics.RangeCounter { 2409 rangeCount++ 2410 if metrics.Unavailable { 2411 unavailableRangeCount++ 2412 } 2413 if metrics.Underreplicated { 2414 underreplicatedRangeCount++ 2415 } 2416 if metrics.Overreplicated { 2417 overreplicatedRangeCount++ 2418 } 2419 } 2420 behindCount += metrics.BehindCount 2421 if qps, dur := rep.leaseholderStats.avgQPS(); dur >= MinStatsDuration { 2422 averageQueriesPerSecond += qps 2423 } 2424 if wps, dur := rep.writeStats.avgQPS(); dur >= MinStatsDuration { 2425 averageWritesPerSecond += wps 2426 } 2427 mc, ok := rep.maxClosed(ctx) 2428 if ok && (minMaxClosedTS.IsEmpty() || mc.Less(minMaxClosedTS)) { 2429 minMaxClosedTS = mc 2430 } 2431 return true // more 2432 }) 2433 2434 s.metrics.RaftLeaderCount.Update(raftLeaderCount) 2435 s.metrics.RaftLeaderNotLeaseHolderCount.Update(raftLeaderNotLeaseHolderCount) 2436 s.metrics.LeaseHolderCount.Update(leaseHolderCount) 2437 s.metrics.LeaseExpirationCount.Update(leaseExpirationCount) 2438 s.metrics.LeaseEpochCount.Update(leaseEpochCount) 2439 s.metrics.QuiescentCount.Update(quiescentCount) 2440 s.metrics.AverageQueriesPerSecond.Update(averageQueriesPerSecond) 2441 s.metrics.AverageWritesPerSecond.Update(averageWritesPerSecond) 2442 s.recordNewPerSecondStats(averageQueriesPerSecond, averageWritesPerSecond) 2443 2444 s.metrics.RangeCount.Update(rangeCount) 2445 s.metrics.UnavailableRangeCount.Update(unavailableRangeCount) 2446 s.metrics.UnderReplicatedRangeCount.Update(underreplicatedRangeCount) 2447 s.metrics.OverReplicatedRangeCount.Update(overreplicatedRangeCount) 2448 s.metrics.RaftLogFollowerBehindCount.Update(behindCount) 2449 2450 if !minMaxClosedTS.IsEmpty() { 2451 nanos := timeutil.Since(minMaxClosedTS.GoTime()).Nanoseconds() 2452 s.metrics.ClosedTimestampMaxBehindNanos.Update(nanos) 2453 } 2454 2455 return nil 2456 } 
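// exampleReplicationGaugeSnapshot is a hypothetical helper sketched here for
// illustration only; it is not used anywhere in the package. It shows how a
// test or debug endpoint could force a one-off refresh of the gauges computed
// above and read them back through the store's metric struct. Passing -1 for
// the tick follows the convention used by systemGossipUpdate for a
// non-periodic computation.
func exampleReplicationGaugeSnapshot(ctx context.Context, s *Store) (int64, int64, error) {
	if err := s.ComputeMetrics(ctx, -1 /* tick */); err != nil {
		return 0, 0, err
	}
	m := s.Metrics()
	// LeaseHolderCount and RaftLeaderCount are the gauges updated by
	// updateReplicationGauges above.
	return m.LeaseHolderCount.Value(), m.RaftLeaderCount.Value(), nil
}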
2457
2458 // checkpoint creates a RocksDB checkpoint in the auxiliary directory with the
2459 // provided tag used in the filepath. The filepath for the checkpoint directory
2460 // is returned.
2461 func (s *Store) checkpoint(ctx context.Context, tag string) (string, error) {
2462 checkpointBase := filepath.Join(s.engine.GetAuxiliaryDir(), "checkpoints")
2463 _ = os.MkdirAll(checkpointBase, 0700)
2464
2465 checkpointDir := filepath.Join(checkpointBase, tag)
2466 if err := s.engine.CreateCheckpoint(checkpointDir); err != nil {
2467 return "", err
2468 }
2469
2470 return checkpointDir, nil
2471 }
2472
2473 // ComputeMetrics immediately computes the current value of store metrics which
2474 // cannot be computed incrementally. This method should be invoked periodically
2475 // by a higher-level system which records store metrics.
2476 //
2477 // The tick argument should increment across repeated calls to this
2478 // method. It is used to compute some metrics less frequently than others.
2479 func (s *Store) ComputeMetrics(ctx context.Context, tick int) error {
2480 ctx = s.AnnotateCtx(ctx)
2481 if err := s.updateCapacityGauges(); err != nil {
2482 return err
2483 }
2484 if err := s.updateReplicationGauges(ctx); err != nil {
2485 return err
2486 }
2487
2488 // Get the latest RocksDB stats.
2489 stats, err := s.engine.GetStats()
2490 if err != nil {
2491 return err
2492 }
2493 s.metrics.updateRocksDBStats(*stats)
2494
2495 // Get engine Env stats.
2496 envStats, err := s.engine.GetEnvStats()
2497 if err != nil {
2498 return err
2499 }
2500 s.metrics.updateEnvStats(*envStats)
2501
2502 sstables := s.engine.GetSSTables()
2503 s.metrics.RdbNumSSTables.Update(int64(sstables.Len()))
2504 readAmp := sstables.ReadAmplification()
2505 s.metrics.RdbReadAmplification.Update(int64(readAmp))
2506 s.metrics.RdbPendingCompaction.Update(stats.PendingCompactionBytesEstimate)
2507 // Log this metric infrequently (with current configurations,
2508 // every 10 minutes). Trigger on tick 1 instead of tick 0 so that
2509 // non-periodic callers of this method don't trigger expensive
2510 // stats.
2511 if tick%logSSTInfoTicks == 1 /* every 10m */ {
2512 log.Infof(ctx, "sstables (read amplification = %d):\n%s", readAmp, sstables)
2513 log.Infof(ctx, "%s", s.engine.GetCompactionStats())
2514 }
2515 return nil
2516 }
2517
2518 // ClusterNodeCount returns this store's view of the number of nodes in the
2519 // cluster. This is the metric used for adaptive zone configs; ranges will not
2520 // be reported as underreplicated if it is low. Tests that wait for full
2521 // replication by tracking the underreplicated metric must also check for the
2522 // expected ClusterNodeCount to avoid catching the cluster while the first node
2523 // is initialized but the other nodes are not.
2524 func (s *Store) ClusterNodeCount() int {
2525 return s.cfg.StorePool.ClusterNodeCount()
2526 }
2527
2528 // HotReplicaInfo contains a range descriptor and its QPS.
2529 type HotReplicaInfo struct {
2530 Desc *roachpb.RangeDescriptor
2531 QPS float64
2532 }
2533
2534 // HottestReplicas returns the hottest replicas on a store, sorted by their
2535 // QPS. Only contains ranges for which this store is the leaseholder.
2536 //
2537 // Note that this uses cached information, so it's cheap but may be slightly
2538 // out of date.
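// The underlying rankings are refreshed as a side effect of Capacity(), so
// the results are only as fresh as the most recent capacity computation.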
2539 func (s *Store) HottestReplicas() []HotReplicaInfo { 2540 topQPS := s.replRankings.topQPS() 2541 hotRepls := make([]HotReplicaInfo, len(topQPS)) 2542 for i := range topQPS { 2543 hotRepls[i].Desc = topQPS[i].repl.Desc() 2544 hotRepls[i].QPS = topQPS[i].qps 2545 } 2546 return hotRepls 2547 } 2548 2549 // StoreKeySpanStats carries the result of a stats computation over a key range. 2550 type StoreKeySpanStats struct { 2551 ReplicaCount int 2552 MVCC enginepb.MVCCStats 2553 ApproximateDiskBytes uint64 2554 } 2555 2556 // ComputeStatsForKeySpan computes the aggregated MVCCStats for all replicas on 2557 // this store which contain any keys in the supplied range. 2558 func (s *Store) ComputeStatsForKeySpan(startKey, endKey roachpb.RKey) (StoreKeySpanStats, error) { 2559 var result StoreKeySpanStats 2560 2561 newStoreReplicaVisitor(s).Visit(func(repl *Replica) bool { 2562 desc := repl.Desc() 2563 if bytes.Compare(startKey, desc.EndKey) >= 0 || bytes.Compare(desc.StartKey, endKey) >= 0 { 2564 return true // continue 2565 } 2566 result.MVCC.Add(repl.GetMVCCStats()) 2567 result.ReplicaCount++ 2568 return true 2569 }) 2570 2571 var err error 2572 result.ApproximateDiskBytes, err = s.engine.ApproximateDiskBytes(startKey.AsRawKey(), endKey.AsRawKey()) 2573 return result, err 2574 } 2575 2576 // AllocatorDryRun runs the given replica through the allocator without actually 2577 // carrying out any changes, returning all trace messages collected along the way. 2578 // Intended to help power a debug endpoint. 2579 func (s *Store) AllocatorDryRun(ctx context.Context, repl *Replica) (tracing.Recording, error) { 2580 ctx, collect, cancel := tracing.ContextWithRecordingSpan(ctx, "allocator dry run") 2581 defer cancel() 2582 canTransferLease := func() bool { return true } 2583 _, err := s.replicateQueue.processOneChange( 2584 ctx, repl, canTransferLease, true /* dryRun */) 2585 if err != nil { 2586 log.Eventf(ctx, "error simulating allocator on replica %s: %s", repl, err) 2587 } 2588 return collect(), nil 2589 } 2590 2591 // ManuallyEnqueue runs the given replica through the requested queue, 2592 // returning all trace events collected along the way as well as the error 2593 // message returned from the queue's process method, if any. Intended to help 2594 // power an admin debug endpoint. 2595 func (s *Store) ManuallyEnqueue( 2596 ctx context.Context, queueName string, repl *Replica, skipShouldQueue bool, 2597 ) (recording tracing.Recording, processError error, enqueueError error) { 2598 ctx = repl.AnnotateCtx(ctx) 2599 2600 var queue queueImpl 2601 var needsLease bool 2602 for _, replicaQueue := range s.scanner.queues { 2603 if strings.EqualFold(replicaQueue.Name(), queueName) { 2604 queue = replicaQueue.(queueImpl) 2605 needsLease = replicaQueue.NeedsLease() 2606 } 2607 } 2608 if queue == nil { 2609 return nil, nil, errors.Errorf("unknown queue type %q", queueName) 2610 } 2611 2612 sysCfg := s.cfg.Gossip.GetSystemConfig() 2613 if sysCfg == nil { 2614 return nil, nil, errors.New("cannot run queue without a valid system config; make sure the cluster " + 2615 "has been initialized and all nodes connected to it") 2616 } 2617 2618 // Many queues are only meant to be run on leaseholder replicas, so attempt to 2619 // take the lease here or bail out early if a different replica has it. 
2620 if needsLease {
2621 hasLease, pErr := repl.getLeaseForGossip(ctx)
2622 if pErr != nil {
2623 return nil, nil, pErr.GoError()
2624 }
2625 if !hasLease {
2626 return nil, errors.Newf("replica %v does not have the range lease", repl), nil
2627 }
2628 }
2629
2630 ctx, collect, cancel := tracing.ContextWithRecordingSpan(
2631 ctx, fmt.Sprintf("manual %s queue run", queueName))
2632 defer cancel()
2633
2634 if !skipShouldQueue {
2635 log.Eventf(ctx, "running %s.shouldQueue", queueName)
2636 shouldQueue, priority := queue.shouldQueue(ctx, s.cfg.Clock.Now(), repl, sysCfg)
2637 log.Eventf(ctx, "shouldQueue=%v, priority=%f", shouldQueue, priority)
2638 if !shouldQueue {
2639 return collect(), nil, nil
2640 }
2641 }
2642
2643 log.Eventf(ctx, "running %s.process", queueName)
2644 processErr := queue.process(ctx, repl, sysCfg)
2645 return collect(), processErr, nil
2646 }
2647
2648 // GetClusterVersion reads the cluster version from the store-local version
2649 // key. Returns an empty version if the key is not found.
2650 func (s *Store) GetClusterVersion(ctx context.Context) (clusterversion.ClusterVersion, error) {
2651 return ReadClusterVersion(ctx, s.engine)
2652 }
2653
2654 // WriteClusterVersion writes the given cluster version to the store-local cluster version key.
2655 func WriteClusterVersion(
2656 ctx context.Context, writer storage.ReadWriter, cv clusterversion.ClusterVersion,
2657 ) error {
2658 return storage.MVCCPutProto(ctx, writer, nil, keys.StoreClusterVersionKey(), hlc.Timestamp{}, nil, &cv)
2659 }
2660
2661 // ReadClusterVersion reads the cluster version from the store-local version key.
2662 func ReadClusterVersion(
2663 ctx context.Context, reader storage.Reader,
2664 ) (clusterversion.ClusterVersion, error) {
2665 var cv clusterversion.ClusterVersion
2666 _, err := storage.MVCCGetProto(ctx, reader, keys.StoreClusterVersionKey(), hlc.Timestamp{},
2667 &cv, storage.MVCCGetOptions{})
2668 return cv, err
2669 }
2670
2671 func init() {
2672 tracing.RegisterTagRemapping("s", "store")
2673 }
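// exampleClusterVersionRoundTrip is a hypothetical helper sketched here for
// illustration only; it is not used anywhere in the package. It shows that
// WriteClusterVersion and ReadClusterVersion above form a simple round trip
// against a single engine (for example an in-memory engine in a test): the
// version that was written is read back unchanged.
func exampleClusterVersionRoundTrip(
	ctx context.Context, eng storage.Engine, cv clusterversion.ClusterVersion,
) (clusterversion.ClusterVersion, error) {
	if err := WriteClusterVersion(ctx, eng, cv); err != nil {
		return clusterversion.ClusterVersion{}, err
	}
	return ReadClusterVersion(ctx, eng)
}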