github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/raft_snapshot_queue.go

github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/raft_snapshot_queue.go (about)

     1  // Copyright 2015 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package kvserver
    12  
    13  import (
    14  	"context"
    15  	"time"
    16  
    17  	"github.com/cockroachdb/cockroach/pkg/config"
    18  	"github.com/cockroachdb/cockroach/pkg/gossip"
    19  	"github.com/cockroachdb/cockroach/pkg/roachpb"
    20  	"github.com/cockroachdb/cockroach/pkg/util/hlc"
    21  	"github.com/cockroachdb/cockroach/pkg/util/log"
    22  	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
    23  	"github.com/cockroachdb/errors"
    24  	"go.etcd.io/etcd/raft/tracker"
    25  )
    26  
    27  const (
    28  	// raftSnapshotQueueTimerDuration is the duration between Raft snapshot of
    29  	// queued replicas.
    30  	raftSnapshotQueueTimerDuration = 0 // zero duration to process Raft snapshots greedily
    31  
    32  	raftSnapshotPriority float64 = 0
    33  )
    34  
// raftSnapshotQueue manages a queue of replicas that may need to send a Raft
// snapshot in order to catch up one of the followers of their range.
//
// It embeds *baseQueue, so all baseQueue methods are promoted onto this type;
// the queue-specific behavior lives in the shouldQueue, process, timer and
// purgatoryChan methods defined in this file.
type raftSnapshotQueue struct {
	*baseQueue
}
    40  
    41  // newRaftSnapshotQueue returns a new instance of raftSnapshotQueue.
    42  func newRaftSnapshotQueue(store *Store, g *gossip.Gossip) *raftSnapshotQueue {
    43  	rq := &raftSnapshotQueue{}
    44  	rq.baseQueue = newBaseQueue(
    45  		"raftsnapshot", rq, store, g,
    46  		queueConfig{
    47  			maxSize: defaultQueueMaxSize,
    48  			// The Raft leader (which sends Raft snapshots) may not be the
    49  			// leaseholder. Operating on a replica without holding the lease is the
    50  			// reason Raft snapshots cannot be performed by the replicateQueue.
    51  			needsLease:           false,
    52  			needsSystemConfig:    false,
    53  			acceptsUnsplitRanges: true,
    54  			processTimeoutFunc:   makeQueueSnapshotTimeoutFunc(recoverySnapshotRate),
    55  			successes:            store.metrics.RaftSnapshotQueueSuccesses,
    56  			failures:             store.metrics.RaftSnapshotQueueFailures,
    57  			pending:              store.metrics.RaftSnapshotQueuePending,
    58  			processingNanos:      store.metrics.RaftSnapshotQueueProcessingNanos,
    59  		},
    60  	)
    61  	return rq
    62  }
    63  
    64  func (rq *raftSnapshotQueue) shouldQueue(
    65  	ctx context.Context, now hlc.Timestamp, repl *Replica, _ *config.SystemConfig,
    66  ) (shouldQ bool, priority float64) {
    67  	// If a follower needs a snapshot, enqueue at the highest priority.
    68  	if status := repl.RaftStatus(); status != nil {
    69  		// raft.Status.Progress is only populated on the Raft group leader.
    70  		for _, p := range status.Progress {
    71  			if p.State == tracker.StateSnapshot {
    72  				if log.V(2) {
    73  					log.Infof(ctx, "raft snapshot needed, enqueuing")
    74  				}
    75  				return true, raftSnapshotPriority
    76  			}
    77  		}
    78  	}
    79  	return false, 0
    80  }
    81  
    82  func (rq *raftSnapshotQueue) process(
    83  	ctx context.Context, repl *Replica, _ *config.SystemConfig,
    84  ) error {
    85  	// If a follower requires a Raft snapshot, perform it.
    86  	if status := repl.RaftStatus(); status != nil {
    87  		// raft.Status.Progress is only populated on the Raft group leader.
    88  		for id, p := range status.Progress {
    89  			if p.State == tracker.StateSnapshot {
    90  				if log.V(1) {
    91  					log.Infof(ctx, "sending raft snapshot")
    92  				}
    93  				if err := rq.processRaftSnapshot(ctx, repl, roachpb.ReplicaID(id)); err != nil {
    94  					return err
    95  				}
    96  			}
    97  		}
    98  	}
    99  	return nil
   100  }
   101  
   102  func (rq *raftSnapshotQueue) processRaftSnapshot(
   103  	ctx context.Context, repl *Replica, id roachpb.ReplicaID,
   104  ) error {
   105  	desc := repl.Desc()
   106  	repDesc, ok := desc.GetReplicaDescriptorByID(id)
   107  	if !ok {
   108  		return errors.Errorf("%s: replica %d not present in %v", repl, id, desc.Replicas())
   109  	}
   110  	snapType := SnapshotRequest_RAFT
   111  
   112  	// A learner replica is either getting a snapshot of type LEARNER by the node
   113  	// that's adding it or it's been orphaned and it's about to be cleaned up by
   114  	// the replicate queue. Either way, no point in also sending it a snapshot of
   115  	// type RAFT.
   116  	if repDesc.GetType() == roachpb.LEARNER {
   117  		if fn := repl.store.cfg.TestingKnobs.ReplicaSkipLearnerSnapshot; fn != nil && fn() {
   118  			return nil
   119  		}
   120  		snapType = SnapshotRequest_LEARNER
   121  		if index := repl.getAndGCSnapshotLogTruncationConstraints(timeutil.Now(), repDesc.StoreID); index > 0 {
   122  			// There is a snapshot being transferred. It's probably a LEARNER snap, so
   123  			// bail for now and try again later.
   124  			err := errors.Errorf(
   125  				"skipping snapshot; replica is likely a learner in the process of being added: %s", repDesc)
   126  			log.Infof(ctx, "%v", err)
   127  			// TODO(dan): This is super brittle and non-obvious. In the common case,
   128  			// this check avoids duplicate work, but in rare cases, we send the
   129  			// learner snap at an index before the one raft wanted here. The raft
   130  			// leader should be able to use logs to get the rest of the way, but it
   131  			// doesn't try. In this case, skipping the raft snapshot would mean that
   132  			// we have to wait for the next scanner cycle of the raft snapshot queue
   133  			// to pick it up again. So, punt the responsibility back to raft by
   134  			// telling it that the snapshot failed. If the learner snap ends up being
   135  			// sufficient, this message will be ignored, but if we hit the case
   136  			// described above, this will cause raft to keep asking for a snap and at
   137  			// some point the snapshot lock above will be released and we'll fall
   138  			// through to the below.
   139  			repl.reportSnapshotStatus(ctx, repDesc.ReplicaID, err)
   140  			return nil
   141  		}
   142  	}
   143  
   144  	err := repl.sendSnapshot(ctx, repDesc, snapType, SnapshotRequest_RECOVERY)
   145  
   146  	// NB: if the snapshot fails because of an overlapping replica on the
   147  	// recipient which is also waiting for a snapshot, the "smart" thing is to
   148  	// send that other snapshot with higher priority. The problem is that the
   149  	// leader for the overlapping range may not be this node. This happens
   150  	// typically during splits and merges when overly aggressive log truncations
   151  	// occur.
   152  	//
   153  	// For splits, the overlapping range will be a replica of the pre-split
   154  	// range that needs a snapshot to catch it up across the split trigger.
   155  	//
   156  	// For merges, the overlapping replicas belong to ranges since subsumed by
   157  	// this range. In particular, there can be many of them if merges apply in
   158  	// rapid succession. The leftmost replica is the most important one to catch
   159  	// up, as it will absorb all of the overlapping replicas when caught up past
   160  	// all of the merges.
   161  	//
   162  	// We're currently not handling this and instead rely on the quota pool to
   163  	// make sure that log truncations won't require snapshots for healthy
   164  	// followers.
   165  	return err
   166  }
   167  
   168  func (*raftSnapshotQueue) timer(_ time.Duration) time.Duration {
   169  	return raftSnapshotQueueTimerDuration
   170  }
   171  
   172  func (rq *raftSnapshotQueue) purgatoryChan() <-chan time.Time {
   173  	return nil
   174  }