github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/raft_log_queue.go

// Copyright 2015 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package kvserver

import (
	"context"
	"fmt"
	"sort"
	"strings"
	"time"

	"github.com/cockroachdb/cockroach/pkg/config"
	"github.com/cockroachdb/cockroach/pkg/gossip"
	"github.com/cockroachdb/cockroach/pkg/kv"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/util"
	"github.com/cockroachdb/cockroach/pkg/util/hlc"
	"github.com/cockroachdb/cockroach/pkg/util/humanizeutil"
	"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
	"github.com/cockroachdb/errors"
	"go.etcd.io/etcd/raft"
	"go.etcd.io/etcd/raft/tracker"
)

const (
	// raftLogQueueTimerDuration is the duration between truncations.
	raftLogQueueTimerDuration = 0 // zero duration to process truncations greedily
	// RaftLogQueueStaleThreshold is the minimum threshold for stale raft log
	// entries. A stale entry is one which all replicas of the range have
	// progressed past and thus is no longer needed and can be truncated.
	RaftLogQueueStaleThreshold = 100
	// RaftLogQueueStaleSize is the minimum size of the Raft log that we'll
	// truncate even if there are fewer than RaftLogQueueStaleThreshold entries
	// to truncate. The value of 64 KB was chosen experimentally by looking at
	// when Raft log truncation usually occurs when using the number of entries
	// as the sole criterion.
	RaftLogQueueStaleSize = 64 << 10
	// Allow a limited number of Raft log truncations to be processed
	// concurrently.
	raftLogQueueConcurrency = 4
	// While a snapshot is in flight, we won't truncate past the snapshot's log
	// index. This behavior is extended to a grace period after the snapshot is
	// marked as completed as it is applied at the receiver only a little later,
	// leaving a window for a truncation that requires another snapshot.
	raftLogQueuePendingSnapshotGracePeriod = 3 * time.Second
)

// raftLogQueue manages a queue of replicas slated to have their raft logs
// truncated by removing unneeded entries.
type raftLogQueue struct {
	*baseQueue
	db *kv.DB

	logSnapshots util.EveryN
}

// newRaftLogQueue returns a new instance of raftLogQueue. Replicas are passed
// to the queue both proactively (triggered by write load) and periodically
// (via the scanner). When processing a replica, the queue decides whether the
// Raft log can be truncated, which is a tradeoff between wanting to keep the
// log short overall and allowing slower followers to catch up before they get
// cut off by a truncation and need a snapshot. See newTruncateDecision for
// details on this decision-making process.
func newRaftLogQueue(store *Store, db *kv.DB, gossip *gossip.Gossip) *raftLogQueue {
	rlq := &raftLogQueue{
		db:           db,
		logSnapshots: util.Every(10 * time.Second),
	}
	rlq.baseQueue = newBaseQueue(
		"raftlog", rlq, store, gossip,
		queueConfig{
			maxSize:              defaultQueueMaxSize,
			maxConcurrency:       raftLogQueueConcurrency,
			needsLease:           false,
			needsSystemConfig:    false,
			acceptsUnsplitRanges: true,
			successes:            store.metrics.RaftLogQueueSuccesses,
			failures:             store.metrics.RaftLogQueueFailures,
			pending:              store.metrics.RaftLogQueuePending,
			processingNanos:      store.metrics.RaftLogQueueProcessingNanos,
		},
	)
	return rlq
}

// newTruncateDecision returns a truncateDecision for the given Replica if no
// error occurs. If input data to establish a truncateDecision is missing, a
// zero decision is returned.
//
// At a high level, a truncate decision operates based on the Raft log size, the
// number of entries in the log, and the Raft status of the followers. In an
// ideal world and most of the time, followers are reasonably up to date, and a
// decision to truncate to the index acked on all replicas will be made whenever
// there is at least a little bit of log to truncate (think a hundred records or
// ~100kb of data). If followers fall behind, are offline, or are waiting for a
// snapshot, a second strategy is needed to make sure that the Raft log is
// eventually truncated: when the raft log size exceeds a limit (4mb at time of
// writing), truncations become willing and able to cut off followers as long as
// a quorum has acked the truncation index. The quota pool ensures that the delta
// between "acked by quorum" and "acked by all" is bounded, while Raft limits the
// size of the uncommitted, i.e. not "acked by quorum", part of the log; thus
// the "quorum" truncation strategy bounds the absolute size of the log on all
// followers.
//
// Exceptions are made for replicas for which information is missing ("probing
// state") as long as they are known to have been online recently, and for
// in-flight snapshots (in particular preemptive snapshots) which are not
// adequately reflected in the Raft status and would otherwise be cut off with
// regularity. Probing live followers should only remain in this state for a
// short moment and so we deny a log truncation outright (as there's no safe
// index to truncate to); for snapshots, we can still truncate, but not past
// the snapshot's index.
//
// A challenge for log truncation is to deal with sideloaded log entries, that
// is, entries which contain SSTables for direct ingestion into the storage
// engine. Such log entries are very large, and failing to account for them in
// the heuristics can trigger overly aggressive truncations.
//
// The raft log size used in the decision-making process is principally updated
// in the main Raft command apply loop, which also adds a Replica to this queue
// whenever the log size has increased by a non-negligible amount that would be
// worth truncating (~100kb).
//
// Unfortunately, the size tracking is not very robust as it suffers from two
// limitations at the time of writing:
// 1. it may undercount as it is in-memory and incremented only as proposals
//    are handled; that is, a freshly started node will believe its Raft log to
//    be zero-sized independent of its actual size, and
// 2. the addition and corresponding subtraction happen in very different places
//    and are difficult to keep bug-free, meaning that there is low confidence
//    that we maintain the delta in a completely accurate manner over time. One
//    example of potential errors is sideloaded proposals, for which the
//    subtraction needs to load the size of the file on-disk (i.e. supplied by
//    the fs), whereas the addition uses the in-memory representation of the
//    file.
//
// Ideally, a Raft log that grows large for whichever reason (for instance the
// queue being stuck on another replica) wouldn't be more than a nuisance on
// nodes with sufficient disk space. Unfortunately, at the time of writing, the
// Raft log is included in Raft snapshots. On the other hand, IMPORT/RESTORE's
// split/scatter phase interacts poorly with overly aggressive truncations and
// can DDOS the Raft snapshot queue.
func newTruncateDecision(ctx context.Context, r *Replica) (truncateDecision, error) {
	rangeID := r.RangeID
	now := timeutil.Now()

	// NB: we need an exclusive lock due to grabbing the first index.
	r.mu.Lock()
	raftLogSize := r.mu.raftLogSize
	// A "cooperative" truncation (i.e. one that does not cut off followers from
	// the log) takes place whenever there are more than
	// RaftLogQueueStaleThreshold entries or the log's estimated size is above
	// RaftLogQueueStaleSize bytes. This is fairly aggressive, so under normal
	// conditions, the log is very small.
	//
	// If followers start falling behind, at some point the logs still need to
	// be truncated. We do this when the size of the log exceeds
	// RaftLogTruncationThreshold (or, in eccentric configurations, the zone's
	// RangeMaxBytes). This captures the heuristic that at some point, it's more
	// efficient to catch up via a snapshot than via applying a long tail of log
	// entries.
	targetSize := r.store.cfg.RaftLogTruncationThreshold
	if targetSize > *r.mu.zone.RangeMaxBytes {
		targetSize = *r.mu.zone.RangeMaxBytes
	}
	raftStatus := r.raftStatusRLocked()

	firstIndex, err := r.raftFirstIndexLocked()
	const anyRecipientStore roachpb.StoreID = 0
	pendingSnapshotIndex := r.getAndGCSnapshotLogTruncationConstraintsLocked(now, anyRecipientStore)
	lastIndex := r.mu.lastIndex
	logSizeTrusted := r.mu.raftLogSizeTrusted
	r.mu.Unlock()

	if err != nil {
		return truncateDecision{}, errors.Errorf("error retrieving first index for r%d: %s", rangeID, err)
	}

	if raftStatus == nil {
		if log.V(6) {
			log.Infof(ctx, "the raft group doesn't exist for r%d", rangeID)
		}
		return truncateDecision{}, nil
	}

	// Is this the raft leader? We only perform log truncation on the raft
	// leader, which has up-to-date info on the followers.
	if raftStatus.RaftState != raft.StateLeader {
		return truncateDecision{}, nil
	}

	// For all our followers, overwrite the RecentActive field (which is always
	// true since we don't use CheckQuorum) with our own activity check.
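	// Concretely, the activity check below consults r.mu.lastUpdateTimes: a
	// follower counts as recently active only if we have heard from it within
	// the range lease active duration.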
	r.mu.RLock()
	log.Eventf(ctx, "raft status before lastUpdateTimes check: %+v", raftStatus.Progress)
	log.Eventf(ctx, "lastUpdateTimes: %+v", r.mu.lastUpdateTimes)
	updateRaftProgressFromActivity(
		ctx, raftStatus.Progress, r.descRLocked().Replicas().All(),
		func(replicaID roachpb.ReplicaID) bool {
			return r.mu.lastUpdateTimes.isFollowerActiveSince(
				ctx, replicaID, now, r.store.cfg.RangeLeaseActiveDuration())
		},
	)
	log.Eventf(ctx, "raft status after lastUpdateTimes check: %+v", raftStatus.Progress)
	r.mu.RUnlock()

	if pr, ok := raftStatus.Progress[raftStatus.Lead]; ok {
		// TODO(tschottdorf): remove this line once we have picked up
		// https://github.com/etcd-io/etcd/pull/10279
		pr.State = tracker.StateReplicate
		raftStatus.Progress[raftStatus.Lead] = pr
	}

	input := truncateDecisionInput{
		RaftStatus:           *raftStatus,
		LogSize:              raftLogSize,
		MaxLogSize:           targetSize,
		LogSizeTrusted:       logSizeTrusted,
		FirstIndex:           firstIndex,
		LastIndex:            lastIndex,
		PendingSnapshotIndex: pendingSnapshotIndex,
	}

	decision := computeTruncateDecision(input)
	return decision, nil
}

func updateRaftProgressFromActivity(
	ctx context.Context,
	prs map[uint64]tracker.Progress,
	replicas []roachpb.ReplicaDescriptor,
	replicaActive func(roachpb.ReplicaID) bool,
) {
	for _, replDesc := range replicas {
		replicaID := replDesc.ReplicaID
		pr, ok := prs[uint64(replicaID)]
		if !ok {
			continue
		}
		pr.RecentActive = replicaActive(replicaID)
		// Override this field for safety since we don't use it. Instead, we use
		// pendingSnapshotIndex from above which is also populated for preemptive
		// snapshots.
		//
		// NOTE: We don't rely on PendingSnapshot because PendingSnapshot is
		// initialized by the leader when it realizes the follower needs a snapshot,
		// and it isn't initialized with the index of the snapshot that is actually
		// sent by us (out of band), which likely is lower.
		pr.PendingSnapshot = 0
		prs[uint64(replicaID)] = pr
	}
}

const (
	truncatableIndexChosenViaCommitIndex     = "commit"
	truncatableIndexChosenViaFollowers       = "followers"
	truncatableIndexChosenViaProbingFollower = "probing follower"
	truncatableIndexChosenViaPendingSnap     = "pending snapshot"
	truncatableIndexChosenViaFirstIndex      = "first index"
	truncatableIndexChosenViaLastIndex       = "last index"
)

type truncateDecisionInput struct {
	RaftStatus            raft.Status
	LogSize, MaxLogSize   int64
	LogSizeTrusted        bool // false when LogSize might be off
	FirstIndex, LastIndex uint64
	PendingSnapshotIndex  uint64
}

func (input truncateDecisionInput) LogTooLarge() bool {
	return input.LogSize > input.MaxLogSize
}

// truncateDecision describes a truncation decision.
// Beware: when extending this struct, be sure to adjust .String()
// so that it is guaranteed to not contain any PII or confidential
// cluster data.
type truncateDecision struct {
	Input       truncateDecisionInput
	CommitIndex uint64

	NewFirstIndex uint64 // first index of the resulting log after truncation
	ChosenVia     string
}

func (td *truncateDecision) raftSnapshotsForIndex(index uint64) int {
	var n int
	for _, p := range td.Input.RaftStatus.Progress {
		if p.State != tracker.StateReplicate {
			// If the follower isn't replicating, we can't trust its Match in
			// the first place.
			// But note that this shouldn't matter in practice
			// as we already take care to not cut off these followers when
			// computing the truncate decision. See:
			_ = truncatableIndexChosenViaProbingFollower // guru ref
			continue
		}

		// When a log truncation happens at the "current log index" (i.e. the
		// most recently committed index), it is often still in flight to the
		// followers not required for quorum, and it is likely that they won't
		// need a truncation to catch up. A follower in that state will have a
		// Match equaling committed-1, but a Next of committed+1 (indicating that
		// an append at 'committed' is already ongoing).
		if p.Match < index && p.Next <= index {
			n++
		}
	}
	if td.Input.PendingSnapshotIndex != 0 && td.Input.PendingSnapshotIndex < index {
		n++
	}

	return n
}

func (td *truncateDecision) NumNewRaftSnapshots() int {
	return td.raftSnapshotsForIndex(td.NewFirstIndex) - td.raftSnapshotsForIndex(td.Input.FirstIndex)
}

// String returns a representation for the decision.
// It is guaranteed to not return PII or confidential
// information from the cluster.
func (td *truncateDecision) String() string {
	var buf strings.Builder
	_, _ = fmt.Fprintf(&buf, "should truncate: %t [", td.ShouldTruncate())
	_, _ = fmt.Fprintf(
		&buf,
		"truncate %d entries to first index %d (chosen via: %s)",
		td.NumTruncatableIndexes(), td.NewFirstIndex, td.ChosenVia,
	)
	if td.Input.LogTooLarge() {
		_, _ = fmt.Fprintf(
			&buf,
			"; log too large (%s > %s)",
			humanizeutil.IBytes(td.Input.LogSize),
			humanizeutil.IBytes(td.Input.MaxLogSize),
		)
	}
	if n := td.NumNewRaftSnapshots(); n > 0 {
		_, _ = fmt.Fprintf(&buf, "; implies %d Raft snapshot%s", n, util.Pluralize(int64(n)))
	}
	if !td.Input.LogSizeTrusted {
		_, _ = fmt.Fprintf(&buf, "; log size untrusted")
	}
	buf.WriteRune(']')

	return buf.String()
}

func (td *truncateDecision) NumTruncatableIndexes() int {
	if td.NewFirstIndex < td.Input.FirstIndex {
		return 0
	}
	return int(td.NewFirstIndex - td.Input.FirstIndex)
}

func (td *truncateDecision) ShouldTruncate() bool {
	n := td.NumTruncatableIndexes()
	return n >= RaftLogQueueStaleThreshold ||
		(n > 0 && td.Input.LogSize >= RaftLogQueueStaleSize)
}

// ProtectIndex attempts to "protect" a position in the log by making sure it's
// not truncated away. Specifically it lowers the proposed truncation point
// (which will be the new first index after the truncation) to the given index
// if it would be truncating at a point past it. If a change is made, the
// ChosenVia is updated with the one given. This protection is not guaranteed if
// the protected index is outside of the existing [FirstIndex,LastIndex] bounds.
func (td *truncateDecision) ProtectIndex(index uint64, chosenVia string) {
	if td.NewFirstIndex > index {
		td.NewFirstIndex = index
		td.ChosenVia = chosenVia
	}
}

// computeTruncateDecision returns the oldest index that cannot be
// truncated. If there is a behind node, we want to keep old raft logs so it
// can catch up without having to send a full snapshot. However, if a node is
// down long enough, sending a snapshot is more efficient and we should
// truncate the log to the next behind node or the quorum committed index.
// We currently truncate when the raft log size is bigger than the range
// size.
//
// Note that when a node is behind we continue to let the raft log build up
// instead of truncating to the commit index. Consider what would happen if we
// truncated to the commit index whenever a node is behind and thus needs to be
// caught up via a snapshot. While we're generating the snapshot, sending it to
// the behind node and waiting for it to be applied, we would continue to
// truncate the log. If the snapshot generation and application takes too long,
// the behind node will be caught up to a point behind the current first index
// and thus require another snapshot, likely entering a never-ending loop of
// snapshots. See #8629.
func computeTruncateDecision(input truncateDecisionInput) truncateDecision {
	decision := truncateDecision{Input: input}
	decision.CommitIndex = input.RaftStatus.Commit

	// The last index is the most aggressive possible truncation that we could
	// do. Everything else in this method makes the truncation less aggressive.
	decision.NewFirstIndex = input.LastIndex
	decision.ChosenVia = truncatableIndexChosenViaLastIndex

	// Start by trying to truncate at the commit index. Naively, you would expect
	// LastIndex to never be smaller than the commit index, but
	// RaftStatus.Progress.Match is updated on the leader when a command is
	// proposed and in a single replica Raft group this also means that
	// RaftStatus.Commit is updated at propose time.
	decision.ProtectIndex(decision.CommitIndex, truncatableIndexChosenViaCommitIndex)

	for _, progress := range input.RaftStatus.Progress {
		// Snapshots are expensive, so we try our best to avoid truncating past
		// where a follower is.

		// First, we never truncate off a recently active follower, no matter how
		// large the log gets. Recently active shares the (currently 10s) constant
		// with the quota pool, so the quota pool should put a bound on how much
		// the raft log can grow due to this.
		//
		// For live followers which are being probed (i.e. the leader doesn't know
		// how far they've caught up), the Match index is too large, and so the
		// quorum index can be, too. We don't want these followers to require a
		// snapshot since they are most likely going to be caught up very soon (they
		// respond with the "right index" to the first probe or don't respond, in
		// which case they should end up as not recently active). But we also don't
		// know their index, so we can't possibly make a truncation decision that
		// avoids that at this point, and we make the truncation a no-op.
		//
		// The scenario in which this is most relevant is during restores, where we
		// split off new ranges that rapidly receive very large log entries while
		// the Raft group is still in a state of discovery (a new leader starts
		// probing followers at its own last index). Additionally, these ranges will
		// be split many times over, resulting in a flurry of snapshots with
		// overlapping bounds that put significant stress on the Raft snapshot
		// queue.
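		//
		// To summarize the three cases handled below: a recently active follower
		// that is being probed protects FirstIndex (making the truncation a
		// no-op); a recently active follower that is replicating protects its
		// Match index; a follower that is not recently active protects its Match
		// index only while the log is still within MaxLogSize.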
		if progress.RecentActive {
			if progress.State == tracker.StateProbe {
				decision.ProtectIndex(input.FirstIndex, truncatableIndexChosenViaProbingFollower)
			} else {
				decision.ProtectIndex(progress.Match, truncatableIndexChosenViaFollowers)
			}
			continue
		}

		// Second, if the follower has not been recently active, we don't
		// truncate it off as long as the raft log is not too large.
		if !input.LogTooLarge() {
			decision.ProtectIndex(progress.Match, truncatableIndexChosenViaFollowers)
		}

		// Otherwise, we let it truncate to the committed index.
	}

	// The pending snapshot index acts as a placeholder for a replica that is
	// about to be added to the range (or is in Raft recovery). We don't want to
	// truncate the log in a way that will require that new replica to be caught
	// up via yet another Raft snapshot.
	if input.PendingSnapshotIndex > 0 {
		decision.ProtectIndex(input.PendingSnapshotIndex, truncatableIndexChosenViaPendingSnap)
	}

	// If the new first index dropped below the first index, make them equal
	// (resulting in a no-op).
	if decision.NewFirstIndex < input.FirstIndex {
		decision.NewFirstIndex = input.FirstIndex
		decision.ChosenVia = truncatableIndexChosenViaFirstIndex
	}

	// We've inherited the unfortunate semantics for {First,Last}Index from
	// raft.Storage. Specifically, both {First,Last}Index are inclusive, so
	// there's no way to represent an empty log. The way we've initialized
	// repl.FirstIndex is to set it to the first index in the possibly-empty log
	// (TruncatedState.Index + 1), and allowing LastIndex to fall behind it when
	// the log is empty (TruncatedState.Index). The initialization is done when
	// minting a new replica from either the truncated state of an incoming
	// snapshot, or using the default initial log index. This makes for the
	// confusing situation where FirstIndex > LastIndex. We can detect this
	// special empty log case by checking if
	// `FirstIndex == LastIndex + 1` (`logEmpty` below). Similar to this, we can
	// have the case that `FirstIndex = CommitIndex + 1` when there are no
	// committed entries (which we check for in `noCommittedEntries` below).
	// Having done that (i.e. if the raft log is not empty, and there are
	// committed entries), we can assert on the following invariants:
	//
	//     FirstIndex    <= LastIndex    (0)
	//     NewFirstIndex >= FirstIndex   (1)
	//     NewFirstIndex <= LastIndex    (2)
	//     NewFirstIndex <= CommitIndex  (3)
	//
	// (1) asserts that we're not regressing our FirstIndex,
	// (2) asserts that we don't truncate past the last index we can truncate
	// away, and
	// (3) is similar to (2) in that we assert that we're not truncating past
	// the last known CommitIndex.
	//
	// TODO(irfansharif): We should consider cleaning up this mess around
	// {First,Last,Commit}Index by using a sentinel value to represent an empty
	// log (like we do with `invalidLastTerm`). It'd be extra nice if we could
	// safeguard access by relying on the type system to force callers to
	// consider the empty case. Something like
	// https://github.com/nvanbenschoten/optional could help us emulate an
	// `option<uint64>` type if we care enough.
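	//
	// For example (hypothetical numbers, for illustration only): a replica
	// initialized from a truncated state at index 10 starts with
	// FirstIndex = 11 and, while its log is still empty, LastIndex = 10, so
	// `logEmpty` below holds; if nothing past that point has committed yet,
	// Commit = 10 as well and `noCommittedEntries` holds too.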
	logEmpty := input.FirstIndex == input.LastIndex+1
	noCommittedEntries := input.FirstIndex == input.RaftStatus.Commit+1

	logIndexValid := logEmpty ||
		(decision.NewFirstIndex >= input.FirstIndex) && (decision.NewFirstIndex <= input.LastIndex)
	commitIndexValid := noCommittedEntries ||
		(decision.NewFirstIndex <= decision.CommitIndex)
	valid := logIndexValid && commitIndexValid
	if !valid {
		err := fmt.Sprintf("invalid truncation decision: output = %d, input: [%d, %d], commit idx = %d",
			decision.NewFirstIndex, input.FirstIndex, input.LastIndex, decision.CommitIndex)
		panic(err)
	}

	return decision
}

// shouldQueue determines whether a range should be queued for truncating. This
// is true only if the replica is the raft leader and if the total number of
// the range's raft log's stale entries exceeds RaftLogQueueStaleThreshold.
func (rlq *raftLogQueue) shouldQueue(
	ctx context.Context, now hlc.Timestamp, r *Replica, _ *config.SystemConfig,
) (shouldQ bool, priority float64) {
	decision, err := newTruncateDecision(ctx, r)
	if err != nil {
		log.Warningf(ctx, "%v", err)
		return false, 0
	}

	shouldQ, _, prio := rlq.shouldQueueImpl(ctx, decision)
	return shouldQ, prio
}

// shouldQueueImpl returns whether the given truncate decision should lead to
// a log truncation. This is either the case if the decision says so or if
// we want to recompute the log size (in which case `recomputeRaftLogSize` and
// `shouldQ` are both true and a reasonable priority is returned).
func (rlq *raftLogQueue) shouldQueueImpl(
	ctx context.Context, decision truncateDecision,
) (shouldQ bool, recomputeRaftLogSize bool, priority float64) {
	if decision.ShouldTruncate() {
		return true, !decision.Input.LogSizeTrusted, float64(decision.Input.LogSize)
	}
	if decision.Input.LogSizeTrusted ||
		decision.Input.LastIndex == decision.Input.FirstIndex {

		return false, false, 0
	}
	// We have a nonempty log (first index != last index) and can't vouch that
	// the bytes in the log are known. Queue the replica; processing it will
	// force a recomputation. For the priority, we have to pick one as we
	// usually use the log size which is not available here. Going half-way
	// between zero and the MaxLogSize should give a good tradeoff between
	// processing the recomputation quickly, and not starving replicas which see
	// a significant amount of write traffic until they run over and truncate
	// more aggressively than they need to.
	return true, true, 1.0 + float64(decision.Input.MaxLogSize)/2.0
}

// process truncates the raft log of the range if the replica is the raft
// leader and if the total number of the range's raft log's stale entries
// exceeds RaftLogQueueStaleThreshold.
func (rlq *raftLogQueue) process(ctx context.Context, r *Replica, _ *config.SystemConfig) error {
	decision, err := newTruncateDecision(ctx, r)
	if err != nil {
		return err
	}

	if _, recompute, _ := rlq.shouldQueueImpl(ctx, decision); recompute {
		log.VEventf(ctx, 2, "recomputing raft log based on decision %+v", decision)

		// We need to hold raftMu both to access the sideloaded storage and to
		// make sure concurrent Raft activity doesn't foul up our update to the
		// cached in-memory values.
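		//
		// Recompute the log size from the engine and the sideloaded storage,
		// replacing the in-memory estimate (which may have drifted; see the
		// comment on newTruncateDecision).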
		r.raftMu.Lock()
		n, err := ComputeRaftLogSize(ctx, r.RangeID, r.Engine(), r.raftMu.sideloaded)
		if err == nil {
			r.mu.Lock()
			r.mu.raftLogSize = n
			r.mu.raftLogLastCheckSize = n
			r.mu.raftLogSizeTrusted = true
			r.mu.Unlock()
		}
		r.raftMu.Unlock()

		if err != nil {
			return errors.Wrap(err, "recomputing raft log size")
		}

		log.VEventf(ctx, 2, "recomputed raft log size to %s", humanizeutil.IBytes(n))

		// Override the decision, now that an accurate log size is available.
		decision, err = newTruncateDecision(ctx, r)
		if err != nil {
			return err
		}
	}

	// Can and should the raft logs be truncated?
	if decision.ShouldTruncate() {
		if n := decision.NumNewRaftSnapshots(); log.V(1) || n > 0 && rlq.logSnapshots.ShouldProcess(timeutil.Now()) {
			log.Infof(ctx, "%v", log.Safe(decision.String()))
		} else {
			log.VEventf(ctx, 1, "%v", log.Safe(decision.String()))
		}
		b := &kv.Batch{}
		b.AddRawRequest(&roachpb.TruncateLogRequest{
			RequestHeader: roachpb.RequestHeader{Key: r.Desc().StartKey.AsRawKey()},
			Index:         decision.NewFirstIndex,
			RangeID:       r.RangeID,
		})
		if err := rlq.db.Run(ctx, b); err != nil {
			return err
		}
		r.store.metrics.RaftLogTruncated.Inc(int64(decision.NumTruncatableIndexes()))
	} else {
		log.VEventf(ctx, 3, "%s", log.Safe(decision.String()))
	}
	return nil
}

// timer returns the interval between processing successive queued truncations.
func (*raftLogQueue) timer(_ time.Duration) time.Duration {
	return raftLogQueueTimerDuration
}

// purgatoryChan returns nil.
func (*raftLogQueue) purgatoryChan() <-chan time.Time {
	return nil
}

var _ sort.Interface = uint64Slice(nil)

// uint64Slice implements sort.Interface.
type uint64Slice []uint64

// Len implements sort.Interface.
func (a uint64Slice) Len() int { return len(a) }

// Swap implements sort.Interface.
func (a uint64Slice) Swap(i, j int) { a[i], a[j] = a[j], a[i] }

// Less implements sort.Interface.
func (a uint64Slice) Less(i, j int) bool { return a[i] < a[j] }
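
// exampleTruncateDecisionSketch is a hypothetical, illustrative sketch: it is
// not referenced by any production code and the numbers are made up. It shows
// how computeTruncateDecision behaves for a small three-replica range where
// the leader and one follower have acked up to index 100 (the commit index),
// a third, recently active follower lags at index 90, and the ~128 KiB log is
// above RaftLogQueueStaleSize. The resulting decision protects the laggard,
// truncating up to (but not past) index 90 via the "followers" rule.
func exampleTruncateDecisionSketch() truncateDecision {
	var st raft.Status
	st.Commit = 100 // quorum-committed index
	st.Progress = map[uint64]tracker.Progress{
		1: {Match: 100, Next: 101, State: tracker.StateReplicate, RecentActive: true},
		2: {Match: 100, Next: 101, State: tracker.StateReplicate, RecentActive: true},
		3: {Match: 90, Next: 91, State: tracker.StateReplicate, RecentActive: true},
	}
	// Expected: NewFirstIndex = 90, ChosenVia = "followers", and
	// ShouldTruncate() is true because 89 entries are truncatable and the log
	// size exceeds RaftLogQueueStaleSize.
	return computeTruncateDecision(truncateDecisionInput{
		RaftStatus:     st,
		LogSize:        128 << 10, // ~128 KiB, above RaftLogQueueStaleSize
		MaxLogSize:     4 << 20,   // 4 MiB, mirroring the default truncation threshold
		LogSizeTrusted: true,
		FirstIndex:     1,
		LastIndex:      100,
	})
}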