github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/replica_raftstorage.go

github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/replica_raftstorage.go (about)

     1  // Copyright 2015 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package kvserver
    12  
    13  import (
    14  	"context"
    15  	"fmt"
    16  	"math"
    17  	"sync/atomic"
    18  	"time"
    19  
    20  	"github.com/cockroachdb/cockroach/pkg/keys"
    21  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverbase"
    22  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverpb"
    23  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/raftentry"
    24  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/rditer"
    25  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/stateloader"
    26  	"github.com/cockroachdb/cockroach/pkg/roachpb"
    27  	"github.com/cockroachdb/cockroach/pkg/storage"
    28  	"github.com/cockroachdb/cockroach/pkg/storage/enginepb"
    29  	"github.com/cockroachdb/cockroach/pkg/util/hlc"
    30  	"github.com/cockroachdb/cockroach/pkg/util/log"
    31  	"github.com/cockroachdb/cockroach/pkg/util/protoutil"
    32  	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
    33  	"github.com/cockroachdb/cockroach/pkg/util/uuid"
    34  	"github.com/cockroachdb/errors"
    35  	"go.etcd.io/etcd/raft"
    36  	"go.etcd.io/etcd/raft/raftpb"
    37  )
    38  
// replicaRaftStorage implements the raft.Storage interface. It is a distinct
// named type defined over Replica, so a *Replica can be converted to a
// *replicaRaftStorage (and back) with a plain pointer conversion.
type replicaRaftStorage Replica

// Compile-time assertion that *replicaRaftStorage satisfies raft.Storage.
var _ raft.Storage = (*replicaRaftStorage)(nil)
    43  
    44  // All calls to raft.RawNode require that both Replica.raftMu and
    45  // Replica.mu are held. All of the functions exposed via the
    46  // raft.Storage interface will in turn be called from RawNode, so none
    47  // of these methods may acquire either lock, but they may require
    48  // their caller to hold one or both locks (even though they do not
    49  // follow our "Locked" naming convention). Specific locking
    50  // requirements are noted in each method's comments.
    51  //
    52  // Many of the methods defined in this file are wrappers around static
    53  // functions. This is done to facilitate their use from
    54  // Replica.Snapshot(), where it is important that all the data that
    55  // goes into the snapshot comes from a consistent view of the
    56  // database, and not the replica's in-memory state or via a reference
    57  // to Replica.store.Engine().
    58  
    59  // InitialState implements the raft.Storage interface.
    60  // InitialState requires that r.mu is held.
    61  func (r *replicaRaftStorage) InitialState() (raftpb.HardState, raftpb.ConfState, error) {
    62  	ctx := r.AnnotateCtx(context.TODO())
    63  	hs, err := r.mu.stateLoader.LoadHardState(ctx, r.store.Engine())
    64  	// For uninitialized ranges, membership is unknown at this point.
    65  	if raft.IsEmptyHardState(hs) || err != nil {
    66  		return raftpb.HardState{}, raftpb.ConfState{}, err
    67  	}
    68  	cs := r.mu.state.Desc.Replicas().ConfState()
    69  	return hs, cs, nil
    70  }
    71  
    72  // Entries implements the raft.Storage interface. Note that maxBytes is advisory
    73  // and this method will always return at least one entry even if it exceeds
    74  // maxBytes. Sideloaded proposals count towards maxBytes with their payloads inlined.
    75  func (r *replicaRaftStorage) Entries(lo, hi, maxBytes uint64) ([]raftpb.Entry, error) {
    76  	readonly := r.store.Engine().NewReadOnly()
    77  	defer readonly.Close()
    78  	ctx := r.AnnotateCtx(context.TODO())
    79  	if r.raftMu.sideloaded == nil {
    80  		return nil, errors.New("sideloaded storage is uninitialized")
    81  	}
    82  	return entries(ctx, r.mu.stateLoader, readonly, r.RangeID, r.store.raftEntryCache,
    83  		r.raftMu.sideloaded, lo, hi, maxBytes)
    84  }
    85  
    86  // raftEntriesLocked requires that r.mu is held.
    87  func (r *Replica) raftEntriesLocked(lo, hi, maxBytes uint64) ([]raftpb.Entry, error) {
    88  	return (*replicaRaftStorage)(r).Entries(lo, hi, maxBytes)
    89  }
    90  
// entries retrieves the raft log entries in [lo, hi) for the given range,
// consulting the raft entry cache first and reading the remainder from the
// supplied reader. To accommodate loading the term, `sideloaded` can be
// supplied as nil, in which case sideloaded entries will not be inlined, the
// raft entry cache will not be populated with *any* of the loaded entries, and
// maxBytes will not be applied to the payloads.
//
// At least one entry is returned even if it alone exceeds maxBytes. When the
// requested entries cannot be served, the error is raft.ErrCompacted if lo has
// been truncated away, raft.ErrUnavailable if the entries do not exist yet, or
// a descriptive error if a gap is found in the middle of the log.
func entries(
	ctx context.Context,
	rsl stateloader.StateLoader,
	reader storage.Reader,
	rangeID roachpb.RangeID,
	eCache *raftentry.Cache,
	sideloaded SideloadStorage,
	lo, hi, maxBytes uint64,
) ([]raftpb.Entry, error) {
	if lo > hi {
		return nil, errors.Errorf("lo:%d is greater than hi:%d", lo, hi)
	}

	// Pre-size the result slice, but cap the up-front allocation at 100
	// entries regardless of how wide the requested span is.
	n := hi - lo
	if n > 100 {
		n = 100
	}
	ents := make([]raftpb.Entry, 0, n)

	// Serve as much as possible from the entry cache. hitIndex is used below
	// as the first index that still has to be read from the engine.
	ents, size, hitIndex, exceededMaxBytes := eCache.Scan(ents, rangeID, lo, hi, maxBytes)

	// Return results if the correct number of results came back or if
	// we ran into the max bytes limit.
	if uint64(len(ents)) == hi-lo || exceededMaxBytes {
		return ents, nil
	}

	// Scan over the log to find the requested entries in the range [lo, hi),
	// stopping once we have enough.
	expectedIndex := hitIndex

	// Whether we can populate the Raft entries cache. False if we found a
	// sideloaded proposal, but the caller didn't give us a sideloaded storage.
	canCache := true

	// ent is declared outside the closure and reused for every decoded entry.
	var ent raftpb.Entry
	// NOTE(review): the boolean returned by scanFunc presumably tells the
	// iteration to stop — confirm against storage.MVCCIterate.
	scanFunc := func(kv roachpb.KeyValue) (bool, error) {
		if err := kv.Value.GetProto(&ent); err != nil {
			return false, err
		}
		// Exit early if we have any gaps or it has been compacted.
		if ent.Index != expectedIndex {
			return true, nil
		}
		expectedIndex++

		if sniffSideloadedRaftCommand(ent.Data) {
			canCache = canCache && sideloaded != nil
			if sideloaded != nil {
				newEnt, err := maybeInlineSideloadedRaftCommand(
					ctx, rangeID, ent, sideloaded, eCache,
				)
				if err != nil {
					return true, err
				}
				if newEnt != nil {
					ent = *newEnt
				}
			}
		}

		// Note that we track the size of proposals with payloads inlined.
		size += uint64(ent.Size())
		if size > maxBytes {
			exceededMaxBytes = true
			// Only drop the entry that crossed the limit if something has
			// already been collected; the first entry is always returned.
			if len(ents) > 0 {
				return exceededMaxBytes, nil
			}
		}
		ents = append(ents, ent)
		return exceededMaxBytes, nil
	}

	if err := iterateEntries(ctx, reader, rangeID, expectedIndex, hi, scanFunc); err != nil {
		return nil, err
	}
	// Cache the fetched entries, if we may.
	if canCache {
		eCache.Add(rangeID, ents, false /* truncate */)
	}

	// Did the correct number of results come back? If so, we're all good.
	if uint64(len(ents)) == hi-lo {
		return ents, nil
	}

	// Did we hit the size limit? If so, return what we have.
	if exceededMaxBytes {
		return ents, nil
	}

	// Did we get any results at all? Because something went wrong.
	if len(ents) > 0 {
		// Was the lo already truncated?
		if ents[0].Index > lo {
			return nil, raft.ErrCompacted
		}

		// Was the missing index after the last index?
		lastIndex, err := rsl.LoadLastIndex(ctx, reader)
		if err != nil {
			return nil, err
		}
		if lastIndex <= expectedIndex {
			return nil, raft.ErrUnavailable
		}

		// We have a gap in the record, if so, return a nasty error.
		return nil, errors.Errorf("there is a gap in the index record between lo:%d and hi:%d at index:%d", lo, hi, expectedIndex)
	}

	// No results, was it due to unavailability or truncation?
	ts, _, err := rsl.LoadRaftTruncatedState(ctx, reader)
	if err != nil {
		return nil, err
	}
	if ts.Index >= lo {
		// The requested lo index has already been truncated.
		return nil, raft.ErrCompacted
	}
	// The requested lo index does not yet exist.
	return nil, raft.ErrUnavailable
}
   218  
   219  func iterateEntries(
   220  	ctx context.Context,
   221  	reader storage.Reader,
   222  	rangeID roachpb.RangeID,
   223  	lo, hi uint64,
   224  	scanFunc func(roachpb.KeyValue) (bool, error),
   225  ) error {
   226  	_, err := storage.MVCCIterate(
   227  		ctx, reader,
   228  		keys.RaftLogKey(rangeID, lo),
   229  		keys.RaftLogKey(rangeID, hi),
   230  		hlc.Timestamp{},
   231  		storage.MVCCScanOptions{},
   232  		scanFunc,
   233  	)
   234  	return err
   235  }
   236  
// invalidLastTerm is an out-of-band value for r.mu.lastTerm that
// invalidates lastTerm caching and forces retrieval of Term(lastTerm)
// from the raftEntryCache/RocksDB. Term only trusts r.mu.lastTerm when it
// differs from this value.
const invalidLastTerm = 0
   241  
   242  // Term implements the raft.Storage interface.
   243  func (r *replicaRaftStorage) Term(i uint64) (uint64, error) {
   244  	// TODO(nvanbenschoten): should we set r.mu.lastTerm when
   245  	//   r.mu.lastIndex == i && r.mu.lastTerm == invalidLastTerm?
   246  	if r.mu.lastIndex == i && r.mu.lastTerm != invalidLastTerm {
   247  		return r.mu.lastTerm, nil
   248  	}
   249  	// Try to retrieve the term for the desired entry from the entry cache.
   250  	if e, ok := r.store.raftEntryCache.Get(r.RangeID, i); ok {
   251  		return e.Term, nil
   252  	}
   253  	readonly := r.store.Engine().NewReadOnly()
   254  	defer readonly.Close()
   255  	ctx := r.AnnotateCtx(context.TODO())
   256  	return term(ctx, r.mu.stateLoader, readonly, r.RangeID, r.store.raftEntryCache, i)
   257  }
   258  
// raftTermRLocked returns the term of the raft log entry at index i by
// delegating to the raft.Storage implementation of Term. It requires that
// r.mu is locked for reading.
func (r *Replica) raftTermRLocked(i uint64) (uint64, error) {
	return (*replicaRaftStorage)(r).Term(i)
}
   263  
   264  func term(
   265  	ctx context.Context,
   266  	rsl stateloader.StateLoader,
   267  	reader storage.Reader,
   268  	rangeID roachpb.RangeID,
   269  	eCache *raftentry.Cache,
   270  	i uint64,
   271  ) (uint64, error) {
   272  	// entries() accepts a `nil` sideloaded storage and will skip inlining of
   273  	// sideloaded entries. We only need the term, so this is what we do.
   274  	ents, err := entries(ctx, rsl, reader, rangeID, eCache, nil /* sideloaded */, i, i+1, math.MaxUint64 /* maxBytes */)
   275  	if errors.Is(err, raft.ErrCompacted) {
   276  		ts, _, err := rsl.LoadRaftTruncatedState(ctx, reader)
   277  		if err != nil {
   278  			return 0, err
   279  		}
   280  		if i == ts.Index {
   281  			return ts.Term, nil
   282  		}
   283  		return 0, raft.ErrCompacted
   284  	} else if err != nil {
   285  		return 0, err
   286  	}
   287  	if len(ents) == 0 {
   288  		return 0, nil
   289  	}
   290  	return ents[0].Term, nil
   291  }
   292  
   293  // LastIndex implements the raft.Storage interface.
   294  func (r *replicaRaftStorage) LastIndex() (uint64, error) {
   295  	return r.mu.lastIndex, nil
   296  }
   297  
   298  // raftLastIndexLocked requires that r.mu is held.
   299  func (r *Replica) raftLastIndexLocked() (uint64, error) {
   300  	return (*replicaRaftStorage)(r).LastIndex()
   301  }
   302  
   303  // raftTruncatedStateLocked returns metadata about the log that preceded the
   304  // first current entry. This includes both entries that have been compacted away
   305  // and the dummy entries that make up the starting point of an empty log.
   306  // raftTruncatedStateLocked requires that r.mu is held.
   307  func (r *Replica) raftTruncatedStateLocked(
   308  	ctx context.Context,
   309  ) (roachpb.RaftTruncatedState, error) {
   310  	if r.mu.state.TruncatedState != nil {
   311  		return *r.mu.state.TruncatedState, nil
   312  	}
   313  	ts, _, err := r.mu.stateLoader.LoadRaftTruncatedState(ctx, r.store.Engine())
   314  	if err != nil {
   315  		return ts, err
   316  	}
   317  	if ts.Index != 0 {
   318  		r.mu.state.TruncatedState = &ts
   319  	}
   320  	return ts, nil
   321  }
   322  
   323  // FirstIndex implements the raft.Storage interface.
   324  func (r *replicaRaftStorage) FirstIndex() (uint64, error) {
   325  	ctx := r.AnnotateCtx(context.TODO())
   326  	ts, err := (*Replica)(r).raftTruncatedStateLocked(ctx)
   327  	if err != nil {
   328  		return 0, err
   329  	}
   330  	return ts.Index + 1, nil
   331  }
   332  
   333  // raftFirstIndexLocked requires that r.mu is held.
   334  func (r *Replica) raftFirstIndexLocked() (uint64, error) {
   335  	return (*replicaRaftStorage)(r).FirstIndex()
   336  }
   337  
   338  // GetFirstIndex is the same function as raftFirstIndexLocked but it requires
   339  // that r.mu is not held.
   340  func (r *Replica) GetFirstIndex() (uint64, error) {
   341  	r.mu.Lock()
   342  	defer r.mu.Unlock()
   343  	return r.raftFirstIndexLocked()
   344  }
   345  
   346  // GetLeaseAppliedIndex returns the lease index of the last applied command.
   347  func (r *Replica) GetLeaseAppliedIndex() uint64 {
   348  	r.mu.RLock()
   349  	defer r.mu.RUnlock()
   350  	return r.mu.state.LeaseAppliedIndex
   351  }
   352  
   353  // Snapshot implements the raft.Storage interface. Snapshot requires that
   354  // r.mu is held. Note that the returned snapshot is a placeholder and
   355  // does not contain any of the replica data. The snapshot is actually generated
   356  // (and sent) by the Raft snapshot queue.
   357  func (r *replicaRaftStorage) Snapshot() (raftpb.Snapshot, error) {
   358  	r.mu.AssertHeld()
   359  	appliedIndex := r.mu.state.RaftAppliedIndex
   360  	term, err := r.Term(appliedIndex)
   361  	if err != nil {
   362  		return raftpb.Snapshot{}, err
   363  	}
   364  	return raftpb.Snapshot{
   365  		Metadata: raftpb.SnapshotMetadata{
   366  			Index: appliedIndex,
   367  			Term:  term,
   368  		},
   369  	}, nil
   370  }
   371  
   372  // raftSnapshotLocked requires that r.mu is held.
   373  func (r *Replica) raftSnapshotLocked() (raftpb.Snapshot, error) {
   374  	return (*replicaRaftStorage)(r).Snapshot()
   375  }
   376  
// GetSnapshot returns a snapshot of the replica appropriate for sending to a
// replica. If this method returns without error, callers must eventually call
// OutgoingSnapshot.Close.
//
// A log truncation constraint keyed by the snapshot's UUID is registered at
// the current applied index for recipientStore. It is completed either when
// the returned snapshot is closed or, on error, before this method returns.
func (r *Replica) GetSnapshot(
	ctx context.Context, snapType SnapshotRequest_Type, recipientStore roachpb.StoreID,
) (_ *OutgoingSnapshot, err error) {
	snapUUID := uuid.MakeV4()
	// Get a snapshot while holding raftMu to make sure we're not seeing "half
	// an AddSSTable" (i.e. a state in which an SSTable has been linked in, but
	// the corresponding Raft command not applied yet).
	r.raftMu.Lock()
	snap := r.store.engine.NewSnapshot()
	r.mu.Lock()
	appliedIndex := r.mu.state.RaftAppliedIndex
	// Cleared when OutgoingSnapshot closes.
	r.addSnapshotLogTruncationConstraintLocked(ctx, snapUUID, appliedIndex, recipientStore)
	r.mu.Unlock()
	r.raftMu.Unlock()

	// release completes the log truncation constraint registered above.
	release := func() {
		now := timeutil.Now()
		r.completeSnapshotLogTruncationConstraint(ctx, snapUUID, now)
	}

	// On failure nothing is handed to the caller, so clean up here: complete
	// the constraint and close the engine snapshot.
	defer func() {
		if err != nil {
			release()
			snap.Close()
		}
	}()

	r.mu.RLock()
	defer r.mu.RUnlock()
	rangeID := r.RangeID

	startKey := r.mu.state.Desc.StartKey
	ctx, sp := r.AnnotateCtxWithSpan(ctx, "snapshot")
	defer sp.Finish()

	log.Eventf(ctx, "new engine snapshot for replica %s", r)

	// Delegate to a static function to make sure that we do not depend
	// on any indirect calls to r.store.Engine() (or other in-memory
	// state of the Replica). Everything must come from the snapshot.
	//
	// withSideloaded runs fn against the replica's sideloaded storage while
	// holding raftMu.
	withSideloaded := func(fn func(SideloadStorage) error) error {
		r.raftMu.Lock()
		defer r.raftMu.Unlock()
		return fn(r.raftMu.sideloaded)
	}
	// NB: We have Replica.mu read-locked, but we need it write-locked in order
	// to use Replica.mu.stateLoader. This call is not performance sensitive, so
	// create a new state loader.
	snapData, err := snapshot(
		ctx, snapUUID, stateloader.Make(rangeID), snapType,
		snap, rangeID, r.store.raftEntryCache, withSideloaded, startKey,
	)
	if err != nil {
		log.Errorf(ctx, "error generating snapshot: %+v", err)
		return nil, err
	}
	// From here on the caller owns cleanup via OutgoingSnapshot.Close.
	snapData.onClose = release
	return &snapData, nil
}
   440  
// OutgoingSnapshot contains the data required to stream a snapshot to a
// recipient. Once one is created, it needs to be closed via Close() to prevent
// resource leakage.
type OutgoingSnapshot struct {
	// SnapUUID identifies this snapshot.
	SnapUUID uuid.UUID
	// The Raft snapshot message to send. Contains SnapUUID as its data.
	RaftSnap raftpb.Snapshot
	// The RocksDB snapshot that will be streamed from.
	EngineSnap storage.Reader
	// The complete range iterator for the snapshot to stream.
	Iter *rditer.ReplicaDataIterator
	// The replica state within the snapshot.
	State kvserverpb.ReplicaState
	// Allows access to the original Replica's sideloaded storage. Note that
	// this isn't a snapshot of the sideloaded storage congruent with EngineSnap
	// or RaftSnap -- a log truncation could have removed files from the
	// sideloaded storage in the meantime.
	WithSideloaded func(func(SideloadStorage) error) error
	// RaftEntryCache is the raft entry cache handed to snapshot() when this
	// OutgoingSnapshot was built.
	RaftEntryCache *raftentry.Cache
	// snapType is the kind of snapshot request; it is included in String().
	snapType SnapshotRequest_Type
	// onClose, if non-nil, is invoked by Close after the iterator and engine
	// snapshot have been closed.
	onClose func()
}
   463  
   464  func (s *OutgoingSnapshot) String() string {
   465  	return fmt.Sprintf("%s snapshot %s at applied index %d", s.snapType, s.SnapUUID.Short(), s.State.RaftAppliedIndex)
   466  }
   467  
   468  // Close releases the resources associated with the snapshot.
   469  func (s *OutgoingSnapshot) Close() {
   470  	s.Iter.Close()
   471  	s.EngineSnap.Close()
   472  	if s.onClose != nil {
   473  		s.onClose()
   474  	}
   475  }
   476  
// IncomingSnapshot contains the data for an incoming streaming snapshot message.
type IncomingSnapshot struct {
	// SnapUUID identifies this snapshot.
	SnapUUID uuid.UUID
	// The storage interface for the underlying SSTs.
	SSTStorageScratch *SSTSnapshotStorageScratch
	// The Raft log entries for this snapshot.
	LogEntries [][]byte
	// The replica state at the time the snapshot was generated (never nil).
	State *kvserverpb.ReplicaState
	// When true, this snapshot contains an unreplicated TruncatedState. When
	// false, the TruncatedState is replicated and the recipient must avoid
	// also writing the unreplicated TruncatedState. The migration to an
	// unreplicated TruncatedState will be carried out during the next log
	// truncation (assuming cluster version is bumped at that point).
	// See the comment on VersionUnreplicatedRaftTruncatedState for details.
	UsesUnreplicatedTruncatedState bool
	// snapType is the kind of snapshot request; it is included in String().
	snapType SnapshotRequest_Type
}
   497  
   498  func (s *IncomingSnapshot) String() string {
   499  	return fmt.Sprintf("%s snapshot %s at applied index %d", s.snapType, s.SnapUUID.Short(), s.State.RaftAppliedIndex)
   500  }
   501  
   502  // snapshot creates an OutgoingSnapshot containing a rocksdb snapshot for the
   503  // given range. Note that snapshot() is called without Replica.raftMu held.
   504  func snapshot(
   505  	ctx context.Context,
   506  	snapUUID uuid.UUID,
   507  	rsl stateloader.StateLoader,
   508  	snapType SnapshotRequest_Type,
   509  	snap storage.Reader,
   510  	rangeID roachpb.RangeID,
   511  	eCache *raftentry.Cache,
   512  	withSideloaded func(func(SideloadStorage) error) error,
   513  	startKey roachpb.RKey,
   514  ) (OutgoingSnapshot, error) {
   515  	var desc roachpb.RangeDescriptor
   516  	// We ignore intents on the range descriptor (consistent=false) because we
   517  	// know they cannot be committed yet; operations that modify range
   518  	// descriptors resolve their own intents when they commit.
   519  	ok, err := storage.MVCCGetProto(ctx, snap, keys.RangeDescriptorKey(startKey),
   520  		hlc.MaxTimestamp, &desc, storage.MVCCGetOptions{Inconsistent: true})
   521  	if err != nil {
   522  		return OutgoingSnapshot{}, errors.Errorf("failed to get desc: %s", err)
   523  	}
   524  	if !ok {
   525  		return OutgoingSnapshot{}, errors.Errorf("couldn't find range descriptor")
   526  	}
   527  
   528  	// Read the range metadata from the snapshot instead of the members
   529  	// of the Range struct because they might be changed concurrently.
   530  	appliedIndex, _, err := rsl.LoadAppliedIndex(ctx, snap)
   531  	if err != nil {
   532  		return OutgoingSnapshot{}, err
   533  	}
   534  
   535  	term, err := term(ctx, rsl, snap, rangeID, eCache, appliedIndex)
   536  	if err != nil {
   537  		return OutgoingSnapshot{}, errors.Errorf("failed to fetch term of %d: %s", appliedIndex, err)
   538  	}
   539  
   540  	state, err := rsl.Load(ctx, snap, &desc)
   541  	if err != nil {
   542  		return OutgoingSnapshot{}, err
   543  	}
   544  
   545  	// Intentionally let this iterator and the snapshot escape so that the
   546  	// streamer can send chunks from it bit by bit.
   547  	iter := rditer.NewReplicaDataIterator(&desc, snap,
   548  		true /* replicatedOnly */, false /* seekEnd */)
   549  
   550  	return OutgoingSnapshot{
   551  		RaftEntryCache: eCache,
   552  		WithSideloaded: withSideloaded,
   553  		EngineSnap:     snap,
   554  		Iter:           iter,
   555  		State:          state,
   556  		SnapUUID:       snapUUID,
   557  		RaftSnap: raftpb.Snapshot{
   558  			Data: snapUUID.GetBytes(),
   559  			Metadata: raftpb.SnapshotMetadata{
   560  				Index: appliedIndex,
   561  				Term:  term,
   562  				// Synthesize our raftpb.ConfState from desc.
   563  				ConfState: desc.Replicas().ConfState(),
   564  			},
   565  		},
   566  		snapType: snapType,
   567  	}, nil
   568  }
   569  
// append the given entries to the raft log. Takes the previous values of
// r.mu.lastIndex, r.mu.lastTerm, and r.mu.raftLogSize, and returns new values.
// We do this rather than modifying them directly because these modifications
// need to be atomic with the commit of the batch. This method requires that
// r.raftMu is held.
//
// append is intentionally oblivious to the existence of sideloaded proposals.
// They are managed by the caller, including cleaning up obsolete on-disk
// payloads in case the log tail is replaced.
//
// NOTE: This method takes a storage.Writer because reads are unnecessary when
// prevLastIndex is 0 and prevLastTerm is invalidLastTerm. In the case where
// reading is necessary (I.E. entries are getting overwritten or deleted), a
// storage.ReadWriter must be passed in; the method panics otherwise.
//
// If entries is empty, the previous values are returned unchanged. On error,
// zero values are returned alongside the error.
func (r *Replica) append(
	ctx context.Context,
	writer storage.Writer,
	prevLastIndex uint64,
	prevLastTerm uint64,
	prevRaftLogSize int64,
	entries []raftpb.Entry,
) (uint64, uint64, int64, error) {
	if len(entries) == 0 {
		return prevLastIndex, prevLastTerm, prevRaftLogSize, nil
	}
	// diff accumulates the stats delta of every write below; its SysBytes is
	// what adjusts the raft log size at the end.
	var diff enginepb.MVCCStats
	// value is reused across iterations to hold each marshaled entry.
	var value roachpb.Value
	for i := range entries {
		ent := &entries[i]
		key := r.raftMu.stateLoader.RaftLogKey(ent.Index)

		if err := value.SetProto(ent); err != nil {
			return 0, 0, 0, err
		}
		value.InitChecksum(key)
		var err error
		if ent.Index > prevLastIndex {
			// A brand new index: nothing to overwrite, so a blind put suffices.
			err = storage.MVCCBlindPut(ctx, writer, &diff, key, hlc.Timestamp{}, value, nil /* txn */)
		} else {
			// We type assert `writer` to also be a storage.ReadWriter only in
			// the case where we're replacing existing entries.
			eng, ok := writer.(storage.ReadWriter)
			if !ok {
				panic("expected writer to be a engine.ReadWriter when overwriting log entries")
			}
			err = storage.MVCCPut(ctx, eng, &diff, key, hlc.Timestamp{}, value, nil /* txn */)
		}
		if err != nil {
			return 0, 0, 0, err
		}
	}

	lastIndex := entries[len(entries)-1].Index
	lastTerm := entries[len(entries)-1].Term
	// Delete any previously appended log entries which never committed.
	if prevLastIndex > 0 {
		// We type assert `writer` to also be a storage.ReadWriter whenever a
		// previous log tail exists, since stale entries may need deleting.
		eng, ok := writer.(storage.ReadWriter)
		if !ok {
			panic("expected writer to be a engine.ReadWriter when deleting log entries")
		}
		for i := lastIndex + 1; i <= prevLastIndex; i++ {
			// Note that the caller is in charge of deleting any sideloaded payloads
			// (which they must only do *after* the batch has committed).
			err := storage.MVCCDelete(ctx, eng, &diff, r.raftMu.stateLoader.RaftLogKey(i),
				hlc.Timestamp{}, nil /* txn */)
			if err != nil {
				return 0, 0, 0, err
			}
		}
	}

	raftLogSize := prevRaftLogSize + diff.SysBytes
	return lastIndex, lastTerm, raftLogSize, nil
}
   646  
   647  // updateRangeInfo is called whenever a range is updated by ApplySnapshot
   648  // or is created by range splitting to setup the fields which are
   649  // uninitialized or need updating.
   650  func (r *Replica) updateRangeInfo(desc *roachpb.RangeDescriptor) error {
   651  	// RangeMaxBytes should be updated by looking up Zone Config in two cases:
   652  	// 1. After applying a snapshot, if the zone config was not updated for
   653  	// this key range, then maxBytes of this range will not be updated either.
   654  	// 2. After a new range is created by a split, only copying maxBytes from
   655  	// the original range wont work as the original and new ranges might belong
   656  	// to different zones.
   657  	// Load the system config.
   658  	cfg := r.store.Gossip().GetSystemConfig()
   659  	if cfg == nil {
   660  		// This could be before the system config was ever gossiped,
   661  		// or it expired. Let the gossip callback set the info.
   662  		ctx := r.AnnotateCtx(context.TODO())
   663  		log.Warningf(ctx, "no system config available, cannot determine range MaxBytes")
   664  		return nil
   665  	}
   666  
   667  	// Find zone config for this range.
   668  	zone, err := cfg.GetZoneConfigForKey(desc.StartKey)
   669  	if err != nil {
   670  		return errors.Errorf("%s: failed to lookup zone config: %s", r, err)
   671  	}
   672  
   673  	r.SetZoneConfig(zone)
   674  	return nil
   675  }
   676  
   677  // clearRangeData clears the data associated with a range descriptor. If
   678  // rangeIDLocalOnly is true, then only the range-id local keys are deleted.
   679  // Otherwise, the range-id local keys, range local keys, and user keys are all
   680  // deleted. If mustClearRange is true, ClearRange will always be used to remove
   681  // the keys. Otherwise, ClearRangeWithHeuristic will be used, which chooses
   682  // ClearRange or ClearIterRange depending on how many keys there are in the
   683  // range.
   684  func clearRangeData(
   685  	desc *roachpb.RangeDescriptor,
   686  	reader storage.Reader,
   687  	writer storage.Writer,
   688  	rangeIDLocalOnly bool,
   689  	mustClearRange bool,
   690  ) error {
   691  	var keyRanges []rditer.KeyRange
   692  	if rangeIDLocalOnly {
   693  		keyRanges = []rditer.KeyRange{rditer.MakeRangeIDLocalKeyRange(desc.RangeID, false)}
   694  	} else {
   695  		keyRanges = rditer.MakeAllKeyRanges(desc)
   696  	}
   697  	var clearRangeFn func(storage.Reader, storage.Writer, roachpb.Key, roachpb.Key) error
   698  	if mustClearRange {
   699  		clearRangeFn = func(reader storage.Reader, writer storage.Writer, start, end roachpb.Key) error {
   700  			return writer.ClearRange(storage.MakeMVCCMetadataKey(start), storage.MakeMVCCMetadataKey(end))
   701  		}
   702  	} else {
   703  		clearRangeFn = storage.ClearRangeWithHeuristic
   704  	}
   705  
   706  	for _, keyRange := range keyRanges {
   707  		if err := clearRangeFn(reader, writer, keyRange.Start.Key, keyRange.End.Key); err != nil {
   708  			return err
   709  		}
   710  	}
   711  	return nil
   712  }
   713  
   714  // applySnapshot updates the replica and its store based on the given snapshot
   715  // and associated HardState. All snapshots must pass through Raft for
   716  // correctness, i.e. the parameters to this method must be taken from a
   717  // raft.Ready. Any replicas specified in subsumedRepls will be destroyed
   718  // atomically with the application of the snapshot.
   719  //
        // The work proceeds in two phases. First, the range's unreplicated range-ID
        // local state (HardState, log entries and, if it is unreplicated, the
        // truncated state) is written to an SST, the disk data of the subsumed
        // replicas is cleared via further SSTs, and all SSTs held by
        // inSnap.SSTStorageScratch are ingested with a single IngestExternalFiles
        // call. Second, the in-memory state of r and of its store is brought in
        // line with what is now on disk. A non-nil error is only ever returned from
        // the first phase; failures in the second phase are fatal.
        //
   720  // If there is a placeholder associated with r, applySnapshot will remove that
   721  // placeholder from the store if and only if it does not return an error.
   722  //
   723  // This method requires that r.raftMu is held, as well as the raftMus of any
   724  // replicas in subsumedRepls.
   725  //
   726  // TODO(benesch): the way this replica method reaches into its store to update
   727  // replicasByKey is unfortunate, but the fix requires a substantial refactor to
   728  // maintain the necessary synchronization.
   729  func (r *Replica) applySnapshot(
   730  	ctx context.Context,
   731  	inSnap IncomingSnapshot,
   732  	snap raftpb.Snapshot,
   733  	hs raftpb.HardState,
   734  	subsumedRepls []*Replica,
   735  ) (err error) {
        	// Work on a (shallow) copy of the snapshot's replica state; it is installed
        	// as r.mu.state at the very end.
   736  	s := *inSnap.State
   737  	if s.Desc.RangeID != r.RangeID {
   738  		log.Fatalf(ctx, "unexpected range ID %d", s.Desc.RangeID)
   739  	}
   740  
        	// Count the snapshot as applied, by type, only if we return without an
        	// error. This reads the named result err.
   741  	snapType := inSnap.snapType
   742  	defer func() {
   743  		if err == nil {
   744  			switch snapType {
   745  			case SnapshotRequest_RAFT:
   746  				r.store.metrics.RangeSnapshotsNormalApplied.Inc(1)
   747  			case SnapshotRequest_LEARNER:
   748  				r.store.metrics.RangeSnapshotsLearnerApplied.Inc(1)
   749  			}
   750  		}
   751  	}()
   752  
   753  	if raft.IsEmptySnap(snap) {
   754  		// Raft discarded the snapshot, indicating that our local state is
   755  		// already ahead of what the snapshot provides. But we count it for
   756  		// stats (see the defer above).
   757  		//
   758  		// Since we're not returning an error, we're responsible for removing any
   759  		// placeholder that might exist.
   760  		r.store.mu.Lock()
   761  		if r.store.removePlaceholderLocked(ctx, r.RangeID) {
   762  			atomic.AddInt32(&r.store.counts.filledPlaceholders, 1)
   763  		}
   764  		r.store.mu.Unlock()
   765  		return nil
   766  	}
   767  	if raft.IsEmptyHardState(hs) {
   768  		// Raft will never provide an empty HardState if it is providing a
   769  		// nonempty snapshot because we discard snapshots that do not increase
   770  		// the commit index.
   771  		log.Fatalf(ctx, "found empty HardState for non-empty Snapshot %+v", snap)
   772  	}
   773  
        	// stats records the wall-clock times at which the two expensive steps
        	// finished; the deferred logger below turns them into durations.
   774  	var stats struct {
   775  		// Time to process subsumed replicas.
   776  		subsumedReplicas time.Time
   777  		// Time to ingest SSTs.
   778  		ingestion time.Time
   779  	}
   780  	log.Infof(ctx, "applying %s snapshot [id=%s index=%d]",
   781  		snapType, inSnap.SnapUUID.Short(), snap.Metadata.Index)
        	// NOTE(review): this deferred log does not check err, so it also runs when
        	// an error is returned below. In that case stats.subsumedReplicas and/or
        	// stats.ingestion may still be the zero time and the reported durations
        	// are not meaningful.
   782  	defer func(start time.Time) {
   783  		now := timeutil.Now()
   784  		totalLog := fmt.Sprintf(
   785  			"total=%0.0fms ",
   786  			now.Sub(start).Seconds()*1000,
   787  		)
   788  		var subsumedReplicasLog string
   789  		if len(subsumedRepls) > 0 {
   790  			subsumedReplicasLog = fmt.Sprintf(
   791  				"subsumedReplicas=%d@%0.0fms ",
   792  				len(subsumedRepls),
   793  				stats.subsumedReplicas.Sub(start).Seconds()*1000,
   794  			)
   795  		}
   796  		ingestionLog := fmt.Sprintf(
   797  			"ingestion=%d@%0.0fms ",
   798  			len(inSnap.SSTStorageScratch.SSTs()),
   799  			stats.ingestion.Sub(stats.subsumedReplicas).Seconds()*1000,
   800  		)
   801  		log.Infof(ctx, "applied %s snapshot [%s%s%sid=%s index=%d]",
   802  			snapType, totalLog, subsumedReplicasLog, ingestionLog,
   803  			inSnap.SnapUUID.Short(), snap.Metadata.Index)
   804  	}(timeutil.Now())
   805  
        	// Everything written for the unreplicated range-ID keyspace goes into one
        	// in-memory SST, which is handed to the snapshot's scratch storage below.
   806  	unreplicatedSSTFile := &storage.MemFile{}
   807  	unreplicatedSST := storage.MakeIngestionSSTWriter(unreplicatedSSTFile)
   808  	defer unreplicatedSST.Close()
   809  
   810  	// Clearing the unreplicated state.
   811  	unreplicatedPrefixKey := keys.MakeRangeIDUnreplicatedPrefix(r.RangeID)
   812  	unreplicatedStart := storage.MakeMVCCMetadataKey(unreplicatedPrefixKey)
   813  	unreplicatedEnd := storage.MakeMVCCMetadataKey(unreplicatedPrefixKey.PrefixEnd())
   814  	if err = unreplicatedSST.ClearRange(unreplicatedStart, unreplicatedEnd); err != nil {
   815  		return errors.Wrapf(err, "error clearing range of unreplicated SST writer")
   816  	}
   817  
   818  	// Update HardState.
   819  	if err := r.raftMu.stateLoader.SetHardState(ctx, &unreplicatedSST, hs); err != nil {
   820  		return errors.Wrapf(err, "unable to write HardState to unreplicated SST writer")
   821  	}
   822  
   823  	// Update Raft entries.
   824  	var lastTerm uint64
   825  	var raftLogSize int64
   826  	if len(inSnap.LogEntries) > 0 {
        		// The snapshot carries its log entries in marshaled form; decode them
        		// before appending them to the SST.
   827  		logEntries := make([]raftpb.Entry, len(inSnap.LogEntries))
   828  		for i, bytes := range inSnap.LogEntries {
   829  			if err := protoutil.Unmarshal(bytes, &logEntries[i]); err != nil {
   830  				return err
   831  			}
   832  		}
   833  		var sideloadedEntriesSize int64
        		// NB: this err shadows the named result within this block; every return
        		// below passes it explicitly, so the deferred metrics update still sees
        		// the error.
   834  		var err error
   835  		logEntries, sideloadedEntriesSize, err = r.maybeSideloadEntriesRaftMuLocked(ctx, logEntries)
   836  		if err != nil {
   837  			return err
   838  		}
   839  		raftLogSize += sideloadedEntriesSize
   840  		_, lastTerm, raftLogSize, err = r.append(ctx, &unreplicatedSST, 0, invalidLastTerm, raftLogSize, logEntries)
   841  		if err != nil {
   842  			return err
   843  		}
   844  	} else {
   845  		lastTerm = invalidLastTerm
   846  	}
   847  	r.store.raftEntryCache.Drop(r.RangeID)
   848  
   849  	// Update TruncatedState if it is unreplicated.
   850  	if inSnap.UsesUnreplicatedTruncatedState {
   851  		if err := r.raftMu.stateLoader.SetRaftTruncatedState(
   852  			ctx, &unreplicatedSST, s.TruncatedState,
   853  		); err != nil {
   854  			return errors.Wrapf(err, "unable to write UnreplicatedTruncatedState to unreplicated SST writer")
   855  		}
   856  	}
   857  
   858  	if err := unreplicatedSST.Finish(); err != nil {
   859  		return err
   860  	}
   861  	if unreplicatedSST.DataSize > 0 {
   862  		// TODO(itsbilal): Write to SST directly in unreplicatedSST rather than
   863  		// buffering in a MemFile first.
   864  		if err := inSnap.SSTStorageScratch.WriteSST(ctx, unreplicatedSSTFile.Data()); err != nil {
   865  			return err
   866  		}
   867  	}
   868  
        	// Sanity checks on the snapshot contents before anything is ingested.
   869  	if s.RaftAppliedIndex != snap.Metadata.Index {
   870  		log.Fatalf(ctx, "snapshot RaftAppliedIndex %d doesn't match its metadata index %d",
   871  			s.RaftAppliedIndex, snap.Metadata.Index)
   872  	}
   873  
        	// The snapshot must carry exactly the log entries between its truncated
        	// index and its applied index. If it does not, take a checkpoint of the
        	// store (best effort) for later inspection and crash.
   874  	if expLen := s.RaftAppliedIndex - s.TruncatedState.Index; expLen != uint64(len(inSnap.LogEntries)) {
   875  		entriesRange, err := extractRangeFromEntries(inSnap.LogEntries)
   876  		if err != nil {
   877  			return err
   878  		}
   879  
   880  		tag := fmt.Sprintf("r%d_%s", r.RangeID, inSnap.SnapUUID.String())
   881  		dir, err := r.store.checkpoint(ctx, tag)
   882  		if err != nil {
   883  			log.Warningf(ctx, "unable to create checkpoint %s: %+v", dir, err)
   884  		} else {
   885  			log.Warningf(ctx, "created checkpoint %s", dir)
   886  		}
   887  
   888  		log.Fatalf(ctx, "missing log entries in snapshot (%s): got %d entries, expected %d "+
   889  			"(TruncatedState.Index=%d, HardState=%s, LogEntries=%s)",
   890  			inSnap.String(), len(inSnap.LogEntries), expLen, s.TruncatedState.Index,
   891  			hs.String(), entriesRange)
   892  	}
   893  
   894  	// If we're subsuming a replica below, we don't have its last NextReplicaID,
   895  	// nor can we obtain it. That's OK: we can just be conservative and use the
   896  	// maximum possible replica ID. preDestroyRaftMuLocked will write a replica
   897  	// tombstone using this maximum possible replica ID, which would normally be
   898  	// problematic, as it would prevent this store from ever having a new replica
   899  	// of the removed range. In this case, however, it's copacetic, as subsumed
   900  	// ranges _can't_ have new replicas.
   901  	if err := r.clearSubsumedReplicaDiskData(ctx, inSnap.SSTStorageScratch, s.Desc, subsumedRepls, mergedTombstoneReplicaID); err != nil {
   902  		return err
   903  	}
   904  	stats.subsumedReplicas = timeutil.Now()
   905  
   906  	// Ingest all SSTs atomically.
   907  	if fn := r.store.cfg.TestingKnobs.BeforeSnapshotSSTIngestion; fn != nil {
   908  		if err := fn(inSnap, snapType, inSnap.SSTStorageScratch.SSTs()); err != nil {
   909  			return err
   910  		}
   911  	}
   912  	if err := r.store.engine.IngestExternalFiles(ctx, inSnap.SSTStorageScratch.SSTs()); err != nil {
   913  		return errors.Wrapf(err, "while ingesting %s", inSnap.SSTStorageScratch.SSTs())
   914  	}
   915  	stats.ingestion = timeutil.Now()
   916  
   917  	// The on-disk state is now committed, but the corresponding in-memory state
   918  	// has not yet been updated. Any errors past this point must therefore be
   919  	// treated as fatal.
   920  
   921  	if err := r.clearSubsumedReplicaInMemoryData(ctx, subsumedRepls, mergedTombstoneReplicaID); err != nil {
   922  		log.Fatalf(ctx, "failed to clear in-memory data of subsumed replicas while applying snapshot: %+v", err)
   923  	}
   924  
   925  	// Atomically swap the placeholder, if any, for the replica, and update the
   926  	// replica's descriptor.
   927  	r.store.mu.Lock()
   928  	if r.store.removePlaceholderLocked(ctx, r.RangeID) {
   929  		atomic.AddInt32(&r.store.counts.filledPlaceholders, 1)
   930  	}
   931  	r.setDescRaftMuLocked(ctx, s.Desc)
   932  	if err := r.store.maybeMarkReplicaInitializedLocked(ctx, r); err != nil {
   933  		log.Fatalf(ctx, "unable to mark replica initialized while applying snapshot: %+v", err)
   934  	}
   935  	r.store.mu.Unlock()
   936  
   937  	// Invoke the leasePostApply method to ensure we properly initialize the
   938  	// replica according to whether it holds the lease. We allow jumps in the
   939  	// lease sequence because there may be multiple lease changes accounted for
   940  	// in the snapshot.
   941  	r.leasePostApply(ctx, *s.Lease, true /* permitJump */)
   942  
   943  	// Inform the concurrency manager that this replica just applied a snapshot.
   944  	r.concMgr.OnReplicaSnapshotApplied()
   945  
   946  	r.mu.Lock()
   947  	// We set the persisted last index to the last applied index. This is
   948  	// not a correctness issue, but means that we may have just transferred
   949  	// some entries we're about to re-request from the leader and overwrite.
   950  	// However, raft.MultiNode currently expects this behavior, and the
   951  	// performance implications are not likely to be drastic. If our
   952  	// feelings about this ever change, we can add a LastIndex field to
   953  	// raftpb.SnapshotMetadata.
   954  	r.mu.lastIndex = s.RaftAppliedIndex
   955  	r.mu.lastTerm = lastTerm
   956  	r.mu.raftLogSize = raftLogSize
   957  	// Update the store stats for the data in the snapshot.
   958  	r.store.metrics.subtractMVCCStats(*r.mu.state.Stats)
   959  	r.store.metrics.addMVCCStats(*s.Stats)
   960  	// Update the rest of the Raft state. Changes to r.mu.state.Desc must be
   961  	// managed by r.setDescRaftMuLocked and changes to r.mu.state.Lease must be handled
   962  	// by r.leasePostApply, but we called those above, so now it's safe to
   963  	// wholesale replace r.mu.state.
   964  	r.mu.state = s
   965  	// Snapshots typically have fewer log entries than the leaseholder. The next
   966  	// time we hold the lease, recompute the log size before making decisions.
   967  	r.mu.raftLogSizeTrusted = false
   968  	r.assertStateLocked(ctx, r.store.Engine())
   969  	r.mu.Unlock()
   970  
   971  	// The rangefeed processor is listening for the logical ops attached to
   972  	// each raft command. These will be lost during a snapshot, so disconnect
   973  	// the rangefeed, if one exists.
   974  	r.disconnectRangefeedWithReason(
   975  		roachpb.RangeFeedRetryError_REASON_RAFT_SNAPSHOT,
   976  	)
   977  
   978  	// Update the replica's cached byte thresholds. This is a no-op if the system
   979  	// config is not available, in which case we rely on the next gossip update
   980  	// to perform the update.
   981  	if err := r.updateRangeInfo(s.Desc); err != nil {
   982  		log.Fatalf(ctx, "unable to update range info while applying snapshot: %+v", err)
   983  	}
   984  
   985  	return nil
   986  }
   987  
   988  // clearSubsumedReplicaDiskData clears the on disk data of the subsumed
   989  // replicas by creating SSTs with range deletion tombstones. We have to be
   990  // careful here not to have overlapping ranges with the SSTs we have already
   991  // created since that will throw an error while we are ingesting them. This
   992  // method requires that each of the subsumed replicas raftMu is held.
   993  func (r *Replica) clearSubsumedReplicaDiskData(
   994  	ctx context.Context,
   995  	scratch *SSTSnapshotStorageScratch,
   996  	desc *roachpb.RangeDescriptor,
   997  	subsumedRepls []*Replica,
   998  	subsumedNextReplicaID roachpb.ReplicaID,
   999  ) error {
  1000  	getKeyRanges := func(desc *roachpb.RangeDescriptor) [2]rditer.KeyRange {
  1001  		return [...]rditer.KeyRange{
  1002  			rditer.MakeRangeLocalKeyRange(desc),
  1003  			rditer.MakeUserKeyRange(desc),
  1004  		}
  1005  	}
  1006  	keyRanges := getKeyRanges(desc)
  1007  	totalKeyRanges := append([]rditer.KeyRange(nil), keyRanges[:]...)
  1008  	for _, sr := range subsumedRepls {
  1009  		// We have to create an SST for the subsumed replica's range-id local keys.
  1010  		subsumedReplSSTFile := &storage.MemFile{}
  1011  		subsumedReplSST := storage.MakeIngestionSSTWriter(subsumedReplSSTFile)
  1012  		defer subsumedReplSST.Close()
  1013  		// NOTE: We set mustClearRange to true because we are setting
  1014  		// RangeTombstoneKey. Since Clears and Puts need to be done in increasing
  1015  		// order of keys, it is not safe to use ClearRangeIter.
  1016  		if err := sr.preDestroyRaftMuLocked(
  1017  			ctx,
  1018  			r.store.Engine(),
  1019  			&subsumedReplSST,
  1020  			subsumedNextReplicaID,
  1021  			true, /* clearRangeIDLocalOnly */
  1022  			true, /* mustClearRange */
  1023  		); err != nil {
  1024  			subsumedReplSST.Close()
  1025  			return err
  1026  		}
  1027  		if err := subsumedReplSST.Finish(); err != nil {
  1028  			return err
  1029  		}
  1030  		if subsumedReplSST.DataSize > 0 {
  1031  			// TODO(itsbilal): Write to SST directly in subsumedReplSST rather than
  1032  			// buffering in a MemFile first.
  1033  			if err := scratch.WriteSST(ctx, subsumedReplSSTFile.Data()); err != nil {
  1034  				return err
  1035  			}
  1036  		}
  1037  
  1038  		srKeyRanges := getKeyRanges(sr.Desc())
  1039  		// Compute the total key space covered by the current replica and all
  1040  		// subsumed replicas.
  1041  		for i := range srKeyRanges {
  1042  			if srKeyRanges[i].Start.Key.Compare(totalKeyRanges[i].Start.Key) < 0 {
  1043  				totalKeyRanges[i].Start = srKeyRanges[i].Start
  1044  			}
  1045  			if srKeyRanges[i].End.Key.Compare(totalKeyRanges[i].End.Key) > 0 {
  1046  				totalKeyRanges[i].End = srKeyRanges[i].End
  1047  			}
  1048  		}
  1049  	}
  1050  
  1051  	// We might have to create SSTs for the range local keys and user keys
  1052  	// depending on if the subsumed replicas are not fully contained by the
  1053  	// replica in our snapshot. The following is an example to this case
  1054  	// happening.
  1055  	//
  1056  	// a       b       c       d
  1057  	// |---1---|-------2-------|  S1
  1058  	// |---1-------------------|  S2
  1059  	// |---1-----------|---3---|  S3
  1060  	//
  1061  	// Since the merge is the first operation to happen, a follower could be down
  1062  	// before it completes. It is reasonable for a snapshot for r1 from S3 to
  1063  	// subsume both r1 and r2 in S1.
  1064  	for i := range keyRanges {
  1065  		if totalKeyRanges[i].End.Key.Compare(keyRanges[i].End.Key) > 0 {
  1066  			subsumedReplSSTFile := &storage.MemFile{}
  1067  			subsumedReplSST := storage.MakeIngestionSSTWriter(subsumedReplSSTFile)
  1068  			defer subsumedReplSST.Close()
  1069  			if err := storage.ClearRangeWithHeuristic(
  1070  				r.store.Engine(),
  1071  				&subsumedReplSST,
  1072  				keyRanges[i].End.Key,
  1073  				totalKeyRanges[i].End.Key,
  1074  			); err != nil {
  1075  				subsumedReplSST.Close()
  1076  				return err
  1077  			}
  1078  			if err := subsumedReplSST.Finish(); err != nil {
  1079  				return err
  1080  			}
  1081  			if subsumedReplSST.DataSize > 0 {
  1082  				// TODO(itsbilal): Write to SST directly in subsumedReplSST rather than
  1083  				// buffering in a MemFile first.
  1084  				if err := scratch.WriteSST(ctx, subsumedReplSSTFile.Data()); err != nil {
  1085  					return err
  1086  				}
  1087  			}
  1088  		}
  1089  		// The snapshot must never subsume a replica that extends the range of the
  1090  		// replica to the left. This is because splits and merges (the only
  1091  		// operation that change the key bounds) always leave the start key intact.
  1092  		// Extending to the left implies that either we merged "to the left" (we
  1093  		// don't), or that we're applying a snapshot for another range (we don't do
  1094  		// that either). Something is severely wrong for this to happen.
  1095  		if totalKeyRanges[i].Start.Key.Compare(keyRanges[i].Start.Key) < 0 {
  1096  			log.Fatalf(ctx, "subsuming replica to our left; key range: %v; total key range %v",
  1097  				keyRanges[i], totalKeyRanges[i])
  1098  		}
  1099  	}
  1100  	return nil
  1101  }
  1102  
  1103  // clearSubsumedReplicaInMemoryData clears the in-memory data of the subsumed
  1104  // replicas. This method requires that each of the subsumed replicas raftMu is
  1105  // held.
  1106  func (r *Replica) clearSubsumedReplicaInMemoryData(
  1107  	ctx context.Context, subsumedRepls []*Replica, subsumedNextReplicaID roachpb.ReplicaID,
  1108  ) error {
  1109  	for _, sr := range subsumedRepls {
  1110  		// We removed sr's data when we committed the batch. Finish subsumption by
  1111  		// updating the in-memory bookkeping.
  1112  		if err := sr.postDestroyRaftMuLocked(ctx, sr.GetMVCCStats()); err != nil {
  1113  			return err
  1114  		}
  1115  		// We already hold sr's raftMu, so we must call removeReplicaImpl directly.
  1116  		// Note that it's safe to update the store's metadata for sr's removal
  1117  		// separately from updating the store's metadata for r's new descriptor
  1118  		// (i.e., under a different store.mu acquisition). Each store.mu
  1119  		// acquisition leaves the store in a consistent state, and access to the
  1120  		// replicas themselves is protected by their raftMus, which are held from
  1121  		// start to finish.
  1122  		if err := r.store.removeInitializedReplicaRaftMuLocked(ctx, sr, subsumedNextReplicaID, RemoveOptions{
  1123  			DestroyData: false, // data is already destroyed
  1124  		}); err != nil {
  1125  			return err
  1126  		}
  1127  	}
  1128  	return nil
  1129  }
  1130  
  1131  // extractRangeFromEntries returns a string representation of the range of
  1132  // marshaled list of raft log entries in the form of [first-index, last-index].
  1133  // If the list is empty, "[n/a, n/a]" is returned instead.
  1134  func extractRangeFromEntries(logEntries [][]byte) (string, error) {
  1135  	var firstIndex, lastIndex string
  1136  	if len(logEntries) == 0 {
  1137  		firstIndex = "n/a"
  1138  		lastIndex = "n/a"
  1139  	} else {
  1140  		firstAndLastLogEntries := make([]raftpb.Entry, 2)
  1141  		if err := protoutil.Unmarshal(logEntries[0], &firstAndLastLogEntries[0]); err != nil {
  1142  			return "", err
  1143  		}
  1144  		if err := protoutil.Unmarshal(logEntries[len(logEntries)-1], &firstAndLastLogEntries[1]); err != nil {
  1145  			return "", err
  1146  		}
  1147  
  1148  		firstIndex = string(firstAndLastLogEntries[0].Index)
  1149  		lastIndex = string(firstAndLastLogEntries[1].Index)
  1150  	}
  1151  	return fmt.Sprintf("[%s, %s]", firstIndex, lastIndex), nil
  1152  }
  1153  
        // raftCommandEncodingVersion is the version tag stored in the first byte of
        // an encoded Raft command (see encodeRaftCommandPrefix).
  1154  type raftCommandEncodingVersion byte
  1155  
  1156  // Raft commands are encoded with a 1-byte version (currently 0 or 1), an 8-byte
  1157  // ID, followed by the payload. This inflexible encoding is used so we can
  1158  // efficiently parse the command id while processing the logs.
  1159  //
  1160  // TODO(bdarnell): is this commandID still appropriate for our needs?
  1161  const (
  1162  	// The initial Raft command version, used for all regular Raft traffic.
  1163  	raftVersionStandard raftCommandEncodingVersion = 0
  1164  	// A proposal containing an SSTable which preferably should be sideloaded
  1165  	// (i.e. not stored in the Raft log wholesale). Can be treated as a regular
  1166  	// proposal when arriving on the wire, but when retrieved from the local
  1167  	// Raft log it is necessary to inline the payload first as it has usually
  1168  	// been sideloaded.
  1169  	raftVersionSideloaded raftCommandEncodingVersion = 1
  1170  	// The prescribed length for each command ID.
  1171  	raftCommandIDLen = 8
  1172  	// The prescribed length of each encoded command's prefix.
  1173  	raftCommandPrefixLen = 1 + raftCommandIDLen
  1174  	// The no-split bit is now unused, but we still apply the mask to the first
  1175  	// byte of the command for backward compatibility.
  1176  	//
  1177  	// TODO(tschottdorf): predates v1.0 by a significant margin. Remove.
  1178  	raftCommandNoSplitBit  = 1 << 7
  1179  	raftCommandNoSplitMask = raftCommandNoSplitBit - 1
  1180  )
  1181  
  1182  func encodeRaftCommand(
  1183  	version raftCommandEncodingVersion, commandID kvserverbase.CmdIDKey, command []byte,
  1184  ) []byte {
  1185  	b := make([]byte, raftCommandPrefixLen+len(command))
  1186  	encodeRaftCommandPrefix(b[:raftCommandPrefixLen], version, commandID)
  1187  	copy(b[raftCommandPrefixLen:], command)
  1188  	return b
  1189  }
  1190  
  1191  func encodeRaftCommandPrefix(
  1192  	b []byte, version raftCommandEncodingVersion, commandID kvserverbase.CmdIDKey,
  1193  ) {
  1194  	if len(commandID) != raftCommandIDLen {
  1195  		panic(fmt.Sprintf("invalid command ID length; %d != %d", len(commandID), raftCommandIDLen))
  1196  	}
  1197  	if len(b) != raftCommandPrefixLen {
  1198  		panic(fmt.Sprintf("invalid command prefix length; %d != %d", len(b), raftCommandPrefixLen))
  1199  	}
  1200  	b[0] = byte(version)
  1201  	copy(b[1:], []byte(commandID))
  1202  }
  1203  
  1204  // DecodeRaftCommand splits a raftpb.Entry.Data into its commandID and
  1205  // command portions. The caller is responsible for checking that the data
  1206  // is not empty (which indicates a dummy entry generated by raft rather
  1207  // than a real command). Usage is mostly internal to the storage package
  1208  // but is exported for use by debugging tools.
  1209  func DecodeRaftCommand(data []byte) (kvserverbase.CmdIDKey, []byte) {
  1210  	v := raftCommandEncodingVersion(data[0] & raftCommandNoSplitMask)
  1211  	if v != raftVersionStandard && v != raftVersionSideloaded {
  1212  		panic(fmt.Sprintf("unknown command encoding version %v", data[0]))
  1213  	}
  1214  	return kvserverbase.CmdIDKey(data[1 : 1+raftCommandIDLen]), data[1+raftCommandIDLen:]
  1215  }