github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/split_delay_helper.go (about)

     1  // Copyright 2018 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package kvserver
    12  
    13  import (
    14  	"context"
    15  	"fmt"
    16  	"time"
    17  
    18  	"github.com/cockroachdb/cockroach/pkg/roachpb"
    19  	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
    20  	"go.etcd.io/etcd/raft"
    21  	"go.etcd.io/etcd/raft/tracker"
    22  )
    23  
// splitDelayHelperI abstracts the operations maybeDelaySplitToAvoidSnapshot
// needs from a replica, so that the delay loop only depends on this small
// interface.
type splitDelayHelperI interface {
	// RaftStatus returns the range ID together with the current Raft status.
	// maybeDelaySplitToAvoidSnapshot treats a nil status as "not Raft leader"
	// and does not delay in that case.
	RaftStatus(context.Context) (roachpb.RangeID, *raft.Status)
	// ProposeEmptyCommand is invoked once per loop iteration in which some
	// follower is not yet considered caught up. It has no return value, so
	// callers cannot observe whether the proposal was accepted.
	ProposeEmptyCommand(ctx context.Context)
	// NumAttempts returns the maximum number of iterations of the delay loop.
	NumAttempts() int
	// Sleep waits between attempts and returns the duration that was spent
	// waiting; the caller sums these durations for its status message.
	Sleep(context.Context) time.Duration
}
    30  
// splitDelayHelper is a Replica under a different name; its pointer type
// carries the methods of splitDelayHelperI, each of which converts the
// receiver back to *Replica before operating on it.
type splitDelayHelper Replica
    32  
    33  func (sdh *splitDelayHelper) RaftStatus(ctx context.Context) (roachpb.RangeID, *raft.Status) {
    34  	r := (*Replica)(sdh)
    35  	r.mu.RLock()
    36  	raftStatus := r.raftStatusRLocked()
    37  	if raftStatus != nil {
    38  		updateRaftProgressFromActivity(
    39  			ctx, raftStatus.Progress, r.descRLocked().Replicas().All(),
    40  			func(replicaID roachpb.ReplicaID) bool {
    41  				return r.mu.lastUpdateTimes.isFollowerActiveSince(
    42  					ctx, replicaID, timeutil.Now(), r.store.cfg.RangeLeaseActiveDuration())
    43  			},
    44  		)
    45  	}
    46  	r.mu.RUnlock()
    47  	return r.RangeID, raftStatus
    48  }
    49  
    50  func (sdh *splitDelayHelper) ProposeEmptyCommand(ctx context.Context) {
    51  	r := (*Replica)(sdh)
    52  	r.raftMu.Lock()
    53  	_ = r.withRaftGroup(true /* campaignOnWake */, func(rawNode *raft.RawNode) (bool, error) {
    54  		// NB: intentionally ignore the error (which can be ErrProposalDropped
    55  		// when there's an SST inflight).
    56  		data := encodeRaftCommand(raftVersionStandard, makeIDKey(), nil)
    57  		_ = rawNode.Propose(data)
    58  		// NB: we need to unquiesce as the group might be quiesced.
    59  		return true /* unquiesceAndWakeLeader */, nil
    60  	})
    61  	r.raftMu.Unlock()
    62  }
    63  
    64  func (sdh *splitDelayHelper) NumAttempts() int {
    65  	// There is a related mechanism regarding snapshots and splits that is worth
    66  	// pointing out here: Incoming MsgApp (see the _ assignment below) are
    67  	// dropped if they are addressed to uninitialized replicas likely to become
    68  	// initialized via a split trigger. These MsgApp are sent approximately once
    69  	// per heartbeat interval, but sometimes there's an additional delay thanks
    70  	// to having to wait for a GC run. In effect, it shouldn't take more than a
    71  	// small number of heartbeats until the follower leaves probing status, so
    72  	// NumAttempts should at least match that.
    73  	_ = maybeDropMsgApp // guru assignment
    74  	// Snapshots can come up for other reasons and at the end of the day, the
    75  	// delay introduced here needs to make sure that the snapshot queue
    76  	// processes at a higher rate than splits happen, so the number of attempts
    77  	// will typically be much higher than what's suggested by maybeDropMsgApp.
    78  	return (*Replica)(sdh).store.cfg.RaftDelaySplitToSuppressSnapshotTicks
    79  }
    80  
    81  func (sdh *splitDelayHelper) Sleep(ctx context.Context) time.Duration {
    82  	tBegin := timeutil.Now()
    83  
    84  	r := (*Replica)(sdh)
    85  	select {
    86  	case <-time.After(r.store.cfg.RaftTickInterval):
    87  	case <-ctx.Done():
    88  	}
    89  
    90  	return timeutil.Since(tBegin)
    91  }
    92  
    93  func maybeDelaySplitToAvoidSnapshot(ctx context.Context, sdh splitDelayHelperI) string {
    94  	maxDelaySplitToAvoidSnapshotTicks := sdh.NumAttempts()
    95  
    96  	var slept time.Duration
    97  	var extra string
    98  	var succeeded bool
    99  	for ticks := 0; ticks < maxDelaySplitToAvoidSnapshotTicks; ticks++ {
   100  		succeeded = false
   101  		extra = ""
   102  		rangeID, raftStatus := sdh.RaftStatus(ctx)
   103  
   104  		if raftStatus == nil {
   105  			// Don't delay on followers (we don't know when to stop). This case
   106  			// is hit rarely enough to not matter.
   107  			extra += "; not Raft leader"
   108  			succeeded = true
   109  			break
   110  		}
   111  
   112  		done := true
   113  		for replicaID, pr := range raftStatus.Progress {
   114  			if pr.State != tracker.StateReplicate {
   115  				if !pr.RecentActive {
   116  					if ticks == 0 {
   117  						// Having set done = false, we make sure we're not exiting early.
   118  						// This is important because we sometimes need that Raft proposal
   119  						// below to make the followers active as there's no chatter on an
   120  						// idle range. (Note that there's a theoretical race in which the
   121  						// follower becomes inactive again during the sleep, but the
   122  						// inactivity interval is much larger than a tick).
   123  						//
   124  						// Don't do this more than once though: if a follower is down,
   125  						// we don't want to delay splits for it.
   126  						done = false
   127  					}
   128  					extra += fmt.Sprintf("; r%d/%d inactive", rangeID, replicaID)
   129  					continue
   130  				}
   131  				done = false
   132  				extra += fmt.Sprintf("; replica r%d/%d not caught up: %+v", rangeID, replicaID, &pr)
   133  			}
   134  		}
   135  		if done {
   136  			succeeded = true
   137  			break
   138  		}
   139  		// Propose an empty command which works around a Raft bug that can
   140  		// leave a follower in ProgressStateProbe even though it has caught
   141  		// up.
   142  		sdh.ProposeEmptyCommand(ctx)
   143  		slept += sdh.Sleep(ctx)
   144  
   145  		if ctx.Err() != nil {
   146  			return ""
   147  		}
   148  	}
   149  
   150  	if slept != 0 {
   151  		extra += fmt.Sprintf("; delayed split for %.1fs to avoid Raft snapshot", slept.Seconds())
   152  		if !succeeded {
   153  			extra += " (without success)"
   154  		}
   155  	}
   156  
   157  	return extra
   158  }