github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/store_snapshot.go

     1  // Copyright 2018 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package kvserver
    12  
    13  import (
    14  	"context"
    15  	"fmt"
    16  	"io"
    17  	"time"
    18  
    19  	"github.com/cockroachdb/cockroach/pkg/base"
    20  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/rditer"
    21  	"github.com/cockroachdb/cockroach/pkg/roachpb"
    22  	"github.com/cockroachdb/cockroach/pkg/settings"
    23  	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
    24  	"github.com/cockroachdb/cockroach/pkg/storage"
    25  	"github.com/cockroachdb/cockroach/pkg/util/envutil"
    26  	"github.com/cockroachdb/cockroach/pkg/util/humanizeutil"
    27  	"github.com/cockroachdb/cockroach/pkg/util/log"
    28  	"github.com/cockroachdb/cockroach/pkg/util/protoutil"
    29  	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
    30  	"github.com/cockroachdb/cockroach/pkg/util/uuid"
    31  	"github.com/cockroachdb/errors"
    32  	crdberrors "github.com/cockroachdb/errors"
    33  	"go.etcd.io/etcd/raft/raftpb"
    34  	"golang.org/x/time/rate"
    35  )
    36  
    37  const (
    38  	// Messages that provide detail about why a snapshot was rejected.
    39  	snapshotStoreTooFullMsg = "store almost out of disk space"
    40  	snapshotApplySemBusyMsg = "store busy applying snapshots"
    41  	storeDrainingMsg        = "store is draining"
    42  
    43  	// IntersectingSnapshotMsg is part of the error message returned from
    44  	// canApplySnapshotLocked and is exposed here so testing can rely on it.
    45  	IntersectingSnapshotMsg = "snapshot intersects existing range"
    46  )
    47  
    48  // incomingSnapshotStream is the minimal interface on a GRPC stream required
    49  // to receive a snapshot over the network.
    50  type incomingSnapshotStream interface {
    51  	Send(*SnapshotResponse) error
    52  	Recv() (*SnapshotRequest, error)
    53  }
    54  
    55  // outgoingSnapshotStream is the minimal interface on a GRPC stream required
    56  // to send a snapshot over the network.
    57  type outgoingSnapshotStream interface {
    58  	Send(*SnapshotRequest) error
    59  	Recv() (*SnapshotResponse, error)
    60  }
    61  
    62  // snapshotStrategy is an approach to sending and receiving Range snapshots.
    63  // Each implementation corresponds to a SnapshotRequest_Strategy, and it is
    64  // expected that the implementation that matches the Strategy specified in the
    65  // snapshot header will always be used.
    66  type snapshotStrategy interface {
    67  	// Receive streams SnapshotRequests in from the provided stream and
    68  	// constructs an IncomingSnapshot.
    69  	Receive(context.Context, incomingSnapshotStream, SnapshotRequest_Header) (IncomingSnapshot, error)
    70  
    71  	// Send streams SnapshotRequests created from the OutgoingSnapshot in to the
    72  	// provided stream. On nil error, the number of bytes sent is returned.
    73  	Send(context.Context, outgoingSnapshotStream, SnapshotRequest_Header, *OutgoingSnapshot) (int64, error)
    74  
    75  	// Status provides a status report on the work performed during the
    76  	// snapshot. Only valid if the strategy succeeded.
    77  	Status() string
    78  
    79  	// Close cleans up any resources associated with the snapshot strategy.
    80  	Close(context.Context)
    81  }
    82  
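         // assertStrategy fatals if the snapshot header's strategy does not match
         // the strategy the caller expects. Sender and receiver both pick their
         // snapshotStrategy implementation from the header, so a mismatch indicates
         // a programming error.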
    83  func assertStrategy(
    84  	ctx context.Context, header SnapshotRequest_Header, expect SnapshotRequest_Strategy,
    85  ) {
    86  	if header.Strategy != expect {
    87  		log.Fatalf(ctx, "expected strategy %s, found strategy %s", expect, header.Strategy)
    88  	}
    89  }
    90  
    91  // kvBatchSnapshotStrategy is an implementation of snapshotStrategy that streams
    92  // batches of KV pairs in the BatchRepr format.
    93  type kvBatchSnapshotStrategy struct {
    94  	raftCfg *base.RaftConfig
    95  	status  string
    96  
    97  	// The size of the batches of PUT operations to send to the receiver of the
    98  	// snapshot. Only used on the sender side.
    99  	batchSize int64
   100  	// Limiter for sending KV batches. Only used on the sender side.
   101  	limiter *rate.Limiter
   102  	// Only used on the sender side.
   103  	newBatch func() storage.Batch
   104  	// bytesSent is updated in sendBatch and returned from Send(). It does not
   105  	// reflect the log entries sent (which are never sent in newer versions of
   106  	// CRDB, as of VersionUnreplicatedTruncatedState).
   107  	bytesSent int64
   108  
   109  	// The approximate size of the SST chunk to buffer in memory on the receiver
   110  	// before flushing to disk. Only used on the receiver side.
   111  	sstChunkSize int64
   112  	// Only used on the receiver side.
   113  	scratch *SSTSnapshotStorageScratch
   114  }
   115  
    116  // multiSSTWriter is a wrapper around an SSTWriter and
   117  // SSTSnapshotStorageScratch that handles chunking SSTs and persisting them to
   118  // disk.
   119  type multiSSTWriter struct {
   120  	scratch   *SSTSnapshotStorageScratch
   121  	currSST   storage.SSTWriter
   122  	keyRanges []rditer.KeyRange
   123  	currRange int
   124  	// The approximate size of the SST chunk to buffer in memory on the receiver
   125  	// before flushing to disk.
   126  	sstChunkSize int64
   127  }
   128  
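         // newMultiSSTWriter returns a multiSSTWriter backed by the provided
         // scratch space, with the SST for the first key range already opened.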
   129  func newMultiSSTWriter(
   130  	ctx context.Context,
   131  	scratch *SSTSnapshotStorageScratch,
   132  	keyRanges []rditer.KeyRange,
   133  	sstChunkSize int64,
   134  ) (multiSSTWriter, error) {
   135  	msstw := multiSSTWriter{
   136  		scratch:      scratch,
   137  		keyRanges:    keyRanges,
   138  		sstChunkSize: sstChunkSize,
   139  	}
   140  	if err := msstw.initSST(ctx); err != nil {
   141  		return msstw, err
   142  	}
   143  	return msstw, nil
   144  }
   145  
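         // initSST creates a new SST file in the scratch space and lays down a
         // range deletion tombstone covering the current key range, so that
         // ingesting the SST clears any existing data in that range.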
   146  func (msstw *multiSSTWriter) initSST(ctx context.Context) error {
   147  	newSSTFile, err := msstw.scratch.NewFile(ctx, msstw.sstChunkSize)
   148  	if err != nil {
   149  		return errors.Wrap(err, "failed to create new sst file")
   150  	}
   151  	newSST := storage.MakeIngestionSSTWriter(newSSTFile)
   152  	msstw.currSST = newSST
   153  	if err := msstw.currSST.ClearRange(msstw.keyRanges[msstw.currRange].Start, msstw.keyRanges[msstw.currRange].End); err != nil {
   154  		msstw.currSST.Close()
   155  		return errors.Wrap(err, "failed to clear range on sst file writer")
   156  	}
   157  	return nil
   158  }
   159  
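         // finalizeSST finishes writing the current SST and advances to the next
         // key range.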
   160  func (msstw *multiSSTWriter) finalizeSST(ctx context.Context) error {
   161  	err := msstw.currSST.Finish()
   162  	if err != nil {
   163  		return errors.Wrap(err, "failed to finish sst")
   164  	}
   165  	msstw.currRange++
   166  	msstw.currSST.Close()
   167  	return nil
   168  }
   169  
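         // Put writes a key/value pair into the SST for the key range that
         // contains the key, finalizing the current SST and starting the next one
         // whenever the key moves past the current key range. Keys must be passed
         // in sorted order.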
   170  func (msstw *multiSSTWriter) Put(ctx context.Context, key storage.MVCCKey, value []byte) error {
   171  	for msstw.keyRanges[msstw.currRange].End.Key.Compare(key.Key) <= 0 {
   172  		// Finish the current SST, write to the file, and move to the next key
   173  		// range.
   174  		if err := msstw.finalizeSST(ctx); err != nil {
   175  			return err
   176  		}
   177  		if err := msstw.initSST(ctx); err != nil {
   178  			return err
   179  		}
   180  	}
   181  	if msstw.keyRanges[msstw.currRange].Start.Key.Compare(key.Key) > 0 {
   182  		return crdberrors.AssertionFailedf("client error: expected %s to fall in one of %s", key.Key, msstw.keyRanges)
   183  	}
   184  	if err := msstw.currSST.Put(key, value); err != nil {
   185  		return errors.Wrap(err, "failed to put in sst")
   186  	}
   187  	return nil
   188  }
   189  
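         // Finish finalizes any remaining SSTs. SSTs are still written for key
         // ranges that received no key-value pairs so that their range deletion
         // tombstones clear any existing data.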
   190  func (msstw *multiSSTWriter) Finish(ctx context.Context) error {
   191  	if msstw.currRange < len(msstw.keyRanges) {
   192  		for {
   193  			if err := msstw.finalizeSST(ctx); err != nil {
   194  				return err
   195  			}
   196  			if msstw.currRange >= len(msstw.keyRanges) {
   197  				break
   198  			}
   199  			if err := msstw.initSST(ctx); err != nil {
   200  				return err
   201  			}
   202  		}
   203  	}
   204  	return nil
   205  }
   206  
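         // Close closes the SST writer currently in use.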
   207  func (msstw *multiSSTWriter) Close() {
   208  	msstw.currSST.Close()
   209  }
   210  
   211  // Receive implements the snapshotStrategy interface.
   212  //
   213  // NOTE: This function assumes that the key-value pairs are sent in sorted
   214  // order. The key-value pairs are sent in the following sorted order:
   215  //
   216  // 1. Replicated range-id local key range
   217  // 2. Range-local key range
   218  // 3. User key range
   219  func (kvSS *kvBatchSnapshotStrategy) Receive(
   220  	ctx context.Context, stream incomingSnapshotStream, header SnapshotRequest_Header,
   221  ) (IncomingSnapshot, error) {
   222  	assertStrategy(ctx, header, SnapshotRequest_KV_BATCH)
   223  
   224  	// At the moment we'll write at most three SSTs.
   225  	// TODO(jeffreyxiao): Re-evaluate as the default range size grows.
   226  	keyRanges := rditer.MakeReplicatedKeyRanges(header.State.Desc)
   227  	msstw, err := newMultiSSTWriter(ctx, kvSS.scratch, keyRanges, kvSS.sstChunkSize)
   228  	if err != nil {
   229  		return noSnap, err
   230  	}
   231  	defer msstw.Close()
   232  	var logEntries [][]byte
   233  
   234  	for {
   235  		req, err := stream.Recv()
   236  		if err != nil {
   237  			return noSnap, err
   238  		}
   239  		if req.Header != nil {
   240  			err := errors.New("client error: provided a header mid-stream")
   241  			return noSnap, sendSnapshotError(stream, err)
   242  		}
   243  
   244  		if req.KVBatch != nil {
   245  			batchReader, err := storage.NewRocksDBBatchReader(req.KVBatch)
   246  			if err != nil {
   247  				return noSnap, errors.Wrap(err, "failed to decode batch")
   248  			}
   249  			// All operations in the batch are guaranteed to be puts.
   250  			for batchReader.Next() {
   251  				if batchReader.BatchType() != storage.BatchTypeValue {
   252  					return noSnap, crdberrors.AssertionFailedf("expected type %d, found type %d", storage.BatchTypeValue, batchReader.BatchType())
   253  				}
   254  				key, err := batchReader.MVCCKey()
   255  				if err != nil {
   256  					return noSnap, errors.Wrap(err, "failed to decode mvcc key")
   257  				}
   258  				if err := msstw.Put(ctx, key, batchReader.Value()); err != nil {
   259  					return noSnap, err
   260  				}
   261  			}
   262  		}
   263  		if req.LogEntries != nil {
   264  			logEntries = append(logEntries, req.LogEntries...)
   265  		}
   266  		if req.Final {
   267  			// We finished receiving all batches and log entries. It's possible that
   268  			// we did not receive any key-value pairs for some of the key ranges, but
   269  			// we must still construct SSTs with range deletion tombstones to remove
   270  			// the data.
   271  			if err := msstw.Finish(ctx); err != nil {
   272  				return noSnap, err
   273  			}
   274  
   275  			msstw.Close()
   276  
   277  			snapUUID, err := uuid.FromBytes(header.RaftMessageRequest.Message.Snapshot.Data)
   278  			if err != nil {
   279  				err = errors.Wrap(err, "client error: invalid snapshot")
   280  				return noSnap, sendSnapshotError(stream, err)
   281  			}
   282  
   283  			inSnap := IncomingSnapshot{
   284  				UsesUnreplicatedTruncatedState: header.UnreplicatedTruncatedState,
   285  				SnapUUID:                       snapUUID,
   286  				SSTStorageScratch:              kvSS.scratch,
   287  				LogEntries:                     logEntries,
   288  				State:                          &header.State,
   289  				snapType:                       header.Type,
   290  			}
   291  
   292  			expLen := inSnap.State.RaftAppliedIndex - inSnap.State.TruncatedState.Index
   293  			if expLen != uint64(len(logEntries)) {
   294  				// We've received a botched snapshot. We could fatal right here but opt
   295  				// to warn loudly instead, and fatal when applying the snapshot
   296  				// (in Replica.applySnapshot) in order to capture replica hard state.
   297  				log.Warningf(ctx,
   298  					"missing log entries in snapshot (%s): got %d entries, expected %d",
   299  					inSnap.String(), len(logEntries), expLen)
   300  			}
   301  
   302  			kvSS.status = fmt.Sprintf("log entries: %d, ssts: %d", len(logEntries), len(kvSS.scratch.SSTs()))
   303  			return inSnap, nil
   304  		}
   305  	}
   306  }
   307  
   308  // errMalformedSnapshot indicates that the snapshot in question is malformed,
    309  // e.g. it is missing raft log entries.
   310  var errMalformedSnapshot = errors.New("malformed snapshot generated")
   311  
   312  // Send implements the snapshotStrategy interface.
   313  func (kvSS *kvBatchSnapshotStrategy) Send(
   314  	ctx context.Context,
   315  	stream outgoingSnapshotStream,
   316  	header SnapshotRequest_Header,
   317  	snap *OutgoingSnapshot,
   318  ) (int64, error) {
   319  	assertStrategy(ctx, header, SnapshotRequest_KV_BATCH)
   320  
   321  	// Iterate over all keys using the provided iterator and stream out batches
   322  	// of key-values.
   323  	n := 0
   324  	var b storage.Batch
   325  	for iter := snap.Iter; ; iter.Next() {
   326  		if ok, err := iter.Valid(); err != nil {
   327  			return 0, err
   328  		} else if !ok {
   329  			break
   330  		}
   331  		key := iter.Key()
   332  		value := iter.Value()
   333  		n++
   334  		if b == nil {
   335  			b = kvSS.newBatch()
   336  		}
   337  		if err := b.Put(key, value); err != nil {
   338  			b.Close()
   339  			return 0, err
   340  		}
   341  
   342  		if int64(b.Len()) >= kvSS.batchSize {
   343  			if err := kvSS.sendBatch(ctx, stream, b); err != nil {
   344  				return 0, err
   345  			}
   346  			b = nil
   347  			// We no longer need the keys and values in the batch we just sent,
   348  			// so reset ReplicaDataIterator's allocator and allow its data to
   349  			// be garbage collected.
   350  			iter.ResetAllocator()
   351  		}
   352  	}
   353  	if b != nil {
   354  		if err := kvSS.sendBatch(ctx, stream, b); err != nil {
   355  			return 0, err
   356  		}
   357  	}
   358  
   359  	// Iterate over the specified range of Raft entries and send them all out
   360  	// together.
   361  	firstIndex := header.State.TruncatedState.Index + 1
   362  	endIndex := snap.RaftSnap.Metadata.Index + 1
   363  	preallocSize := endIndex - firstIndex
   364  	const maxPreallocSize = 1000
   365  	if preallocSize > maxPreallocSize {
   366  		// It's possible for the raft log to become enormous in certain
   367  		// sustained failure conditions. We may bail out of the snapshot
   368  		// process early in scanFunc, but in the worst case this
   369  		// preallocation is enough to run the server out of memory. Limit
   370  		// the size of the buffer we will preallocate.
   371  		preallocSize = maxPreallocSize
   372  	}
   373  	logEntries := make([][]byte, 0, preallocSize)
   374  
   375  	var raftLogBytes int64
   376  	scanFunc := func(kv roachpb.KeyValue) (bool, error) {
   377  		bytes, err := kv.Value.GetBytes()
   378  		if err == nil {
   379  			logEntries = append(logEntries, bytes)
   380  			raftLogBytes += int64(len(bytes))
   381  		}
   382  		return false, err
   383  	}
   384  
   385  	rangeID := header.State.Desc.RangeID
   386  
   387  	if err := iterateEntries(ctx, snap.EngineSnap, rangeID, firstIndex, endIndex, scanFunc); err != nil {
   388  		return 0, err
   389  	}
   390  
   391  	// The difference between the snapshot index (applied index at the time of
   392  	// snapshot) and the truncated index should equal the number of log entries
   393  	// shipped over.
   394  	expLen := endIndex - firstIndex
   395  	if expLen != uint64(len(logEntries)) {
   396  		// We've generated a botched snapshot. We could fatal right here but opt
   397  		// to warn loudly instead, and fatal at the caller to capture a checkpoint
   398  		// of the underlying storage engine.
   399  		entriesRange, err := extractRangeFromEntries(logEntries)
   400  		if err != nil {
   401  			return 0, err
   402  		}
   403  		log.Warningf(ctx, "missing log entries in snapshot (%s): "+
   404  			"got %d entries, expected %d (TruncatedState.Index=%d, LogEntries=%s)",
   405  			snap.String(), len(logEntries), expLen, snap.State.TruncatedState.Index, entriesRange)
   406  		return 0, errMalformedSnapshot
   407  	}
   408  
   409  	// Inline the payloads for all sideloaded proposals.
   410  	//
   411  	// TODO(tschottdorf): could also send slim proposals and attach sideloaded
   412  	// SSTables directly to the snapshot. Probably the better long-term
   413  	// solution, but let's see if it ever becomes relevant. Snapshots with
   414  	// inlined proposals are hopefully the exception.
   415  	{
   416  		var ent raftpb.Entry
   417  		for i := range logEntries {
   418  			if err := protoutil.Unmarshal(logEntries[i], &ent); err != nil {
   419  				return 0, err
   420  			}
   421  			if !sniffSideloadedRaftCommand(ent.Data) {
   422  				continue
   423  			}
   424  			if err := snap.WithSideloaded(func(ss SideloadStorage) error {
   425  				newEnt, err := maybeInlineSideloadedRaftCommand(
   426  					ctx, rangeID, ent, ss, snap.RaftEntryCache,
   427  				)
   428  				if err != nil {
   429  					return err
   430  				}
   431  				if newEnt != nil {
   432  					ent = *newEnt
   433  				}
   434  				return nil
   435  			}); err != nil {
   436  				if errors.Is(err, errSideloadedFileNotFound) {
   437  					// We're creating the Raft snapshot based on a snapshot of
   438  					// the engine, but the Raft log may since have been
   439  					// truncated and corresponding on-disk sideloaded payloads
   440  					// unlinked. Luckily, we can just abort this snapshot; the
   441  					// caller can retry.
   442  					//
   443  					// TODO(tschottdorf): check how callers handle this. They
   444  					// should simply retry. In some scenarios, perhaps this can
   445  					// happen repeatedly and prevent a snapshot; not sending the
   446  					// log entries wouldn't help, though, and so we'd really
   447  					// need to make sure the entries are always here, for
   448  					// instance by pre-loading them into memory. Or we can make
   449  					// log truncation less aggressive about removing sideloaded
   450  					// files, by delaying trailing file deletion for a bit.
   451  					return 0, &errMustRetrySnapshotDueToTruncation{
   452  						index: ent.Index,
   453  						term:  ent.Term,
   454  					}
   455  				}
   456  				return 0, err
   457  			}
   458  			// TODO(tschottdorf): it should be possible to reuse `logEntries[i]` here.
   459  			var err error
   460  			if logEntries[i], err = protoutil.Marshal(&ent); err != nil {
   461  				return 0, err
   462  			}
   463  		}
   464  	}
   465  	kvSS.status = fmt.Sprintf("kv pairs: %d, log entries: %d", n, len(logEntries))
   466  	if err := stream.Send(&SnapshotRequest{LogEntries: logEntries}); err != nil {
   467  		return 0, err
   468  	}
   469  	return kvSS.bytesSent, nil
   470  }
   471  
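         // sendBatch waits for a token from the rate limiter (one token per
         // batch), streams the batch's BatchRepr to the receiver, and records the
         // bytes sent.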
   472  func (kvSS *kvBatchSnapshotStrategy) sendBatch(
   473  	ctx context.Context, stream outgoingSnapshotStream, batch storage.Batch,
   474  ) error {
   475  	if err := kvSS.limiter.WaitN(ctx, 1); err != nil {
   476  		return err
   477  	}
   478  	repr := batch.Repr()
    479  	kvSS.bytesSent += int64(len(repr))
   480  	batch.Close()
   481  	return stream.Send(&SnapshotRequest{KVBatch: repr})
   482  }
   483  
   484  // Status implements the snapshotStrategy interface.
   485  func (kvSS *kvBatchSnapshotStrategy) Status() string { return kvSS.status }
   486  
   487  // Close implements the snapshotStrategy interface.
   488  func (kvSS *kvBatchSnapshotStrategy) Close(ctx context.Context) {
   489  	if kvSS.scratch != nil {
   490  		// A failure to clean up the storage is benign except that it will leak
   491  		// disk space (which is reclaimed on node restart). It is unexpected
   492  		// though, so log a warning.
   493  		if err := kvSS.scratch.Clear(); err != nil {
   494  			log.Warningf(ctx, "error closing kvBatchSnapshotStrategy: %v", err)
   495  		}
   496  	}
   497  }
   498  
   499  // reserveSnapshot throttles incoming snapshots. The returned closure is used
    500  // to clean up the reservation and release its resources. A nil cleanup function
    501  // and a non-empty rejectionMessage indicate that the reservation was declined.
   502  func (s *Store) reserveSnapshot(
   503  	ctx context.Context, header *SnapshotRequest_Header,
   504  ) (_cleanup func(), _rejectionMsg string, _err error) {
   505  	tBegin := timeutil.Now()
   506  	if header.RangeSize == 0 {
   507  		// Empty snapshots are exempt from rate limits because they're so cheap to
   508  		// apply. This vastly speeds up rebalancing any empty ranges created by a
   509  		// RESTORE or manual SPLIT AT, since it prevents these empty snapshots from
   510  		// getting stuck behind large snapshots managed by the replicate queue.
   511  	} else if header.CanDecline {
   512  		storeDesc, ok := s.cfg.StorePool.getStoreDescriptor(s.StoreID())
   513  		if ok && (!maxCapacityCheck(storeDesc) || header.RangeSize > storeDesc.Capacity.Available) {
   514  			return nil, snapshotStoreTooFullMsg, nil
   515  		}
   516  		select {
   517  		case s.snapshotApplySem <- struct{}{}:
   518  		case <-ctx.Done():
   519  			return nil, "", ctx.Err()
   520  		case <-s.stopper.ShouldStop():
   521  			return nil, "", errors.Errorf("stopped")
   522  		default:
   523  			return nil, snapshotApplySemBusyMsg, nil
   524  		}
   525  	} else {
   526  		select {
   527  		case s.snapshotApplySem <- struct{}{}:
   528  		case <-ctx.Done():
   529  			return nil, "", ctx.Err()
   530  		case <-s.stopper.ShouldStop():
   531  			return nil, "", errors.Errorf("stopped")
   532  		}
   533  	}
   534  
   535  	// The choice here is essentially arbitrary, but with a default range size of 64mb and the
   536  	// Raft snapshot rate limiting of 8mb/s, we expect to spend less than 8s per snapshot.
   537  	// Preemptive snapshots are limited to 2mb/s (by default), so they can take up to 4x longer,
   538  	// but an average range is closer to 32mb, so we expect ~16s for larger preemptive snapshots,
   539  	// which is what we want to log.
   540  	const snapshotReservationWaitWarnThreshold = 13 * time.Second
   541  	if elapsed := timeutil.Since(tBegin); elapsed > snapshotReservationWaitWarnThreshold {
   542  		replDesc, _ := header.State.Desc.GetReplicaDescriptor(s.StoreID())
   543  		log.Infof(
   544  			ctx,
   545  			"waited for %.1fs to acquire snapshot reservation to r%d/%d",
   546  			elapsed.Seconds(),
   547  			header.State.Desc.RangeID,
   548  			replDesc.ReplicaID,
   549  		)
   550  	}
   551  
   552  	s.metrics.ReservedReplicaCount.Inc(1)
   553  	s.metrics.Reserved.Inc(header.RangeSize)
   554  	return func() {
   555  		s.metrics.ReservedReplicaCount.Dec(1)
   556  		s.metrics.Reserved.Dec(header.RangeSize)
   557  		if header.RangeSize != 0 {
   558  			<-s.snapshotApplySem
   559  		}
   560  	}, "", nil
   561  }
   562  
   563  // canApplySnapshotLocked returns (_, nil) if the snapshot can be applied to
   564  // this store's replica (i.e. the snapshot is not from an older incarnation of
   565  // the replica) and a placeholder can be added to the replicasByKey map (if
   566  // necessary). If a placeholder is required, it is returned as the first value.
   567  //
   568  // Both the store mu (and the raft mu for an existing replica if there is one)
   569  // must be held.
   570  func (s *Store) canApplySnapshotLocked(
   571  	ctx context.Context, snapHeader *SnapshotRequest_Header,
   572  ) (*ReplicaPlaceholder, error) {
   573  	if snapHeader.IsPreemptive() {
   574  		return nil, crdberrors.AssertionFailedf(`expected a raft or learner snapshot`)
   575  	}
   576  
   577  	// TODO(tbg): see the comment on desc.Generation for what seems to be a much
   578  	// saner way to handle overlap via generational semantics.
   579  	desc := *snapHeader.State.Desc
   580  
   581  	// First, check for an existing Replica.
   582  	v, ok := s.mu.replicas.Load(
   583  		int64(desc.RangeID),
   584  	)
   585  	if !ok {
   586  		return nil, errors.Errorf("canApplySnapshotLocked requires a replica present")
   587  	}
   588  	existingRepl := (*Replica)(v)
   589  	// The raftMu is held which allows us to use the existing replica as a
   590  	// placeholder when we decide that the snapshot can be applied. As long
   591  	// as the caller releases the raftMu only after feeding the snapshot
   592  	// into the replica, this is safe.
   593  	existingRepl.raftMu.AssertHeld()
   594  
   595  	existingRepl.mu.RLock()
   596  	existingDesc := existingRepl.mu.state.Desc
   597  	existingIsInitialized := existingDesc.IsInitialized()
   598  	existingDestroyStatus := existingRepl.mu.destroyStatus
   599  	existingRepl.mu.RUnlock()
   600  
   601  	if existingIsInitialized {
   602  		// Regular Raft snapshots can't be refused at this point,
   603  		// even if they widen the existing replica. See the comments
   604  		// in Replica.maybeAcquireSnapshotMergeLock for how this is
   605  		// made safe.
   606  		//
   607  		// NB: The snapshot must be intended for this replica as
   608  		// withReplicaForRequest ensures that requests with a non-zero replica
   609  		// id are passed to a replica with a matching id. Given this is not a
   610  		// preemptive snapshot we know that its id must be non-zero.
   611  		return nil, nil
   612  	}
   613  
   614  	// If we are not alive then we should not apply a snapshot as our removal
   615  	// is imminent.
   616  	if existingDestroyStatus.Removed() {
   617  		return nil, existingDestroyStatus.err
   618  	}
   619  
   620  	// We have a key range [desc.StartKey,desc.EndKey) which we want to apply a
   621  	// snapshot for. Is there a conflicting existing placeholder or an
   622  	// overlapping range?
   623  	if err := s.checkSnapshotOverlapLocked(ctx, snapHeader); err != nil {
   624  		return nil, err
   625  	}
   626  
   627  	placeholder := &ReplicaPlaceholder{
   628  		rangeDesc: desc,
   629  	}
   630  	return placeholder, nil
   631  }
   632  
   633  // checkSnapshotOverlapLocked returns an error if the snapshot overlaps an
   634  // existing replica or placeholder. Any replicas that do overlap have a good
    635  // chance of being abandoned, so they're proactively handed to the GC queue.
   636  func (s *Store) checkSnapshotOverlapLocked(
   637  	ctx context.Context, snapHeader *SnapshotRequest_Header,
   638  ) error {
   639  	desc := *snapHeader.State.Desc
   640  
   641  	// NB: this check seems redundant since placeholders are also represented in
   642  	// replicasByKey (and thus returned in getOverlappingKeyRangeLocked).
   643  	if exRng, ok := s.mu.replicaPlaceholders[desc.RangeID]; ok {
   644  		return errors.Errorf("%s: canApplySnapshotLocked: cannot add placeholder, have an existing placeholder %s %v", s, exRng, snapHeader.RaftMessageRequest.FromReplica)
   645  	}
   646  
   647  	// TODO(benesch): consider discovering and GC'ing *all* overlapping ranges,
   648  	// not just the first one that getOverlappingKeyRangeLocked happens to return.
   649  	if exRange := s.getOverlappingKeyRangeLocked(&desc); exRange != nil {
   650  		// We have a conflicting range, so we must block the snapshot.
   651  		// When such a conflict exists, it will be resolved by one range
   652  		// either being split or garbage collected.
   653  		exReplica, err := s.GetReplica(exRange.Desc().RangeID)
   654  		msg := IntersectingSnapshotMsg
   655  		if err != nil {
   656  			log.Warningf(ctx, "unable to look up overlapping replica on %s: %v", exReplica, err)
   657  		} else {
   658  			inactive := func(r *Replica) bool {
   659  				if r.RaftStatus() == nil {
   660  					return true
   661  				}
    662  				// TODO(benesch): this check does not detect inactivity on replicas with
   663  				// epoch-based leases. Since the validity of an epoch-based lease is
   664  				// tied to the owning node's liveness, the lease can be valid well after
   665  				// the leader of the range has cut off communication with this replica.
   666  				// Expiration based leases, by contrast, will expire quickly if the
   667  				// leader of the range stops sending this replica heartbeats.
   668  				lease, pendingLease := r.GetLease()
   669  				now := s.Clock().Now()
   670  				return !r.IsLeaseValid(lease, now) &&
   671  					(pendingLease == (roachpb.Lease{}) || !r.IsLeaseValid(pendingLease, now))
   672  			}
   673  			// We unconditionally send this replica through the GC queue. It's
   674  			// reasonably likely that the GC queue will do nothing because the replica
   675  			// needs to split instead, but better to err on the side of queueing too
   676  			// frequently. Blocking Raft snapshots for too long can wedge a cluster,
   677  			// and if the replica does need to be GC'd, this might be the only code
   678  			// path that notices in a timely fashion.
   679  			//
   680  			// We're careful to avoid starving out other replicas in the GC queue by
   681  			// queueing at a low priority unless we can prove that the range is
   682  			// inactive and thus unlikely to be about to process a split.
   683  			gcPriority := replicaGCPriorityDefault
   684  			if inactive(exReplica) {
   685  				gcPriority = replicaGCPrioritySuspect
   686  			}
   687  
   688  			msg += "; initiated GC:"
   689  			s.replicaGCQueue.AddAsync(ctx, exReplica, gcPriority)
   690  		}
   691  		return errors.Errorf("%s %v (incoming %v)", msg, exReplica, snapHeader.State.Desc.RSpan()) // exReplica can be nil
   692  	}
   693  	return nil
   694  }
   695  
   696  // shouldAcceptSnapshotData is an optimization to check whether we should even
   697  // bother to read the data for an incoming snapshot. If the snapshot overlaps an
    698  // existing replica or placeholder, we'd error during application anyway, so
    699  // check now, before transferring all the data. This method is a guess and may have
   700  // false positives. If the snapshot should be rejected, an error is returned
   701  // with a description of why. Otherwise, nil means we should accept the
   702  // snapshot.
   703  func (s *Store) shouldAcceptSnapshotData(
   704  	ctx context.Context, snapHeader *SnapshotRequest_Header,
   705  ) error {
   706  	if snapHeader.IsPreemptive() {
   707  		return crdberrors.AssertionFailedf(`expected a raft or learner snapshot`)
   708  	}
   709  	pErr := s.withReplicaForRequest(ctx, &snapHeader.RaftMessageRequest,
   710  		func(ctx context.Context, r *Replica) *roachpb.Error {
   711  			// If the current replica is not initialized then we should accept this
   712  			// snapshot if it doesn't overlap existing ranges.
   713  			if !r.IsInitialized() {
   714  				s.mu.Lock()
   715  				defer s.mu.Unlock()
   716  				return roachpb.NewError(s.checkSnapshotOverlapLocked(ctx, snapHeader))
   717  			}
   718  			// If the current range is initialized then we need to accept this
   719  			// snapshot.
   720  			return nil
   721  		})
   722  	return pErr.GoError()
   723  }
   724  
   725  // receiveSnapshot receives an incoming snapshot via a pre-opened GRPC stream.
   726  func (s *Store) receiveSnapshot(
   727  	ctx context.Context, header *SnapshotRequest_Header, stream incomingSnapshotStream,
   728  ) error {
   729  	if fn := s.cfg.TestingKnobs.ReceiveSnapshot; fn != nil {
   730  		if err := fn(header); err != nil {
   731  			return sendSnapshotError(stream, err)
   732  		}
   733  	}
   734  
   735  	if header.IsPreemptive() {
   736  		return crdberrors.AssertionFailedf(`expected a raft or learner snapshot`)
   737  	}
   738  
    739  	// Defensive check that any snapshot contains this store in the descriptor.
   740  	storeID := s.StoreID()
   741  	if _, ok := header.State.Desc.GetReplicaDescriptor(storeID); !ok {
   742  		return crdberrors.AssertionFailedf(
   743  			`snapshot of type %s was sent to s%d which did not contain it as a replica: %s`,
   744  			header.Type, storeID, header.State.Desc.Replicas())
   745  	}
   746  
   747  	cleanup, rejectionMsg, err := s.reserveSnapshot(ctx, header)
   748  	if err != nil {
   749  		return err
   750  	}
   751  	if cleanup == nil {
   752  		return stream.Send(&SnapshotResponse{
   753  			Status:  SnapshotResponse_DECLINED,
   754  			Message: rejectionMsg,
   755  		})
   756  	}
   757  	defer cleanup()
   758  
   759  	// Check to see if the snapshot can be applied but don't attempt to add
   760  	// a placeholder here, because we're not holding the replica's raftMu.
   761  	// We'll perform this check again later after receiving the rest of the
   762  	// snapshot data - this is purely an optimization to prevent downloading
   763  	// a snapshot that we know we won't be able to apply.
   764  	if err := s.shouldAcceptSnapshotData(ctx, header); err != nil {
   765  		return sendSnapshotError(stream,
   766  			errors.Wrapf(err, "%s,r%d: cannot apply snapshot", s, header.State.Desc.RangeID),
   767  		)
   768  	}
   769  
   770  	// Determine which snapshot strategy the sender is using to send this
   771  	// snapshot. If we don't know how to handle the specified strategy, return
   772  	// an error.
   773  	var ss snapshotStrategy
   774  	switch header.Strategy {
   775  	case SnapshotRequest_KV_BATCH:
   776  		snapUUID, err := uuid.FromBytes(header.RaftMessageRequest.Message.Snapshot.Data)
   777  		if err != nil {
   778  			err = errors.Wrap(err, "invalid snapshot")
   779  			return sendSnapshotError(stream, err)
   780  		}
   781  
   782  		ss = &kvBatchSnapshotStrategy{
   783  			raftCfg:      &s.cfg.RaftConfig,
   784  			scratch:      s.sstSnapshotStorage.NewScratchSpace(header.State.Desc.RangeID, snapUUID),
   785  			sstChunkSize: snapshotSSTWriteSyncRate.Get(&s.cfg.Settings.SV),
   786  		}
   787  		defer ss.Close(ctx)
   788  	default:
   789  		return sendSnapshotError(stream,
   790  			errors.Errorf("%s,r%d: unknown snapshot strategy: %s",
   791  				s, header.State.Desc.RangeID, header.Strategy),
   792  		)
   793  	}
   794  
   795  	if err := stream.Send(&SnapshotResponse{Status: SnapshotResponse_ACCEPTED}); err != nil {
   796  		return err
   797  	}
   798  	if log.V(2) {
   799  		log.Infof(ctx, "accepted snapshot reservation for r%d", header.State.Desc.RangeID)
   800  	}
   801  
   802  	inSnap, err := ss.Receive(ctx, stream, *header)
   803  	if err != nil {
   804  		return err
   805  	}
   806  	if err := s.processRaftSnapshotRequest(ctx, header, inSnap); err != nil {
   807  		return sendSnapshotError(stream, errors.Wrap(err.GoError(), "failed to apply snapshot"))
   808  	}
   809  
   810  	return stream.Send(&SnapshotResponse{Status: SnapshotResponse_APPLIED})
   811  }
   812  
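         // sendSnapshotError sends an ERROR status along with the error's message
         // to the snapshot sender on the provided stream.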
   813  func sendSnapshotError(stream incomingSnapshotStream, err error) error {
   814  	return stream.Send(&SnapshotResponse{
   815  		Status:  SnapshotResponse_ERROR,
   816  		Message: err.Error(),
   817  	})
   818  }
   819  
   820  // SnapshotStorePool narrows StorePool to make sendSnapshot easier to test.
   821  type SnapshotStorePool interface {
   822  	throttle(reason throttleReason, why string, toStoreID roachpb.StoreID)
   823  }
   824  
   825  // validatePositive is a function to validate that a settings value is positive.
   826  func validatePositive(v int64) error {
   827  	if v <= 0 {
   828  		return errors.Errorf("%d is not positive", v)
   829  	}
   830  	return nil
   831  }
   832  
   833  // rebalanceSnapshotRate is the rate at which preemptive snapshots can be sent.
   834  // This includes snapshots generated for upreplication or for rebalancing.
   835  var rebalanceSnapshotRate = settings.RegisterPublicValidatedByteSizeSetting(
   836  	"kv.snapshot_rebalance.max_rate",
   837  	"the rate limit (bytes/sec) to use for rebalance and upreplication snapshots",
   838  	envutil.EnvOrDefaultBytes("COCKROACH_PREEMPTIVE_SNAPSHOT_RATE", 8<<20),
   839  	validatePositive,
   840  )
   841  
    842  // recoverySnapshotRate is the rate at which Raft-initiated snapshots can be
    843  // sent. Ideally, one would never see a Raft-initiated snapshot; we'd like all
    844  // the snapshots to be preemptive. However, it has proved infeasible to
   845  // completely get rid of them.
   846  // TODO(tbg): The existence of this rate, separate from rebalanceSnapshotRate,
   847  // does not make a whole lot of sense.
   848  var recoverySnapshotRate = settings.RegisterPublicValidatedByteSizeSetting(
   849  	"kv.snapshot_recovery.max_rate",
   850  	"the rate limit (bytes/sec) to use for recovery snapshots",
   851  	envutil.EnvOrDefaultBytes("COCKROACH_RAFT_SNAPSHOT_RATE", 8<<20),
   852  	validatePositive,
   853  )
   854  
   855  // snapshotSSTWriteSyncRate is the size of chunks to write before fsync-ing.
   856  // The default of 2 MiB was chosen to be in line with the behavior in bulk-io.
   857  // See sstWriteSyncRate.
   858  var snapshotSSTWriteSyncRate = settings.RegisterByteSizeSetting(
   859  	"kv.snapshot_sst.sync_size",
   860  	"threshold after which snapshot SST writes must fsync",
   861  	2<<20, /* 2 MiB */
   862  )
   863  
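         // snapshotRateLimit returns the configured rate limit (bytes/sec) for a
         // snapshot of the given priority: kv.snapshot_recovery.max_rate for
         // recovery snapshots and kv.snapshot_rebalance.max_rate for rebalance
         // snapshots.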
   864  func snapshotRateLimit(
   865  	st *cluster.Settings, priority SnapshotRequest_Priority,
   866  ) (rate.Limit, error) {
   867  	switch priority {
   868  	case SnapshotRequest_RECOVERY:
   869  		return rate.Limit(recoverySnapshotRate.Get(&st.SV)), nil
   870  	case SnapshotRequest_REBALANCE:
   871  		return rate.Limit(rebalanceSnapshotRate.Get(&st.SV)), nil
   872  	default:
   873  		return 0, errors.Errorf("unknown snapshot priority: %s", priority)
   874  	}
   875  }
   876  
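         // errMustRetrySnapshotDueToTruncation is returned when a sideloaded
         // payload referenced by the Raft log was truncated away while the
         // snapshot was being generated. The snapshot is aborted and the caller is
         // expected to retry.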
   877  type errMustRetrySnapshotDueToTruncation struct {
   878  	index, term uint64
   879  }
   880  
   881  func (e *errMustRetrySnapshotDueToTruncation) Error() string {
   882  	return fmt.Sprintf(
   883  		"log truncation during snapshot removed sideloaded SSTable at index %d, term %d",
   884  		e.index, e.term,
   885  	)
   886  }
   887  
   888  // sendSnapshot sends an outgoing snapshot via a pre-opened GRPC stream.
   889  func sendSnapshot(
   890  	ctx context.Context,
   891  	raftCfg *base.RaftConfig,
   892  	st *cluster.Settings,
   893  	stream outgoingSnapshotStream,
   894  	storePool SnapshotStorePool,
   895  	header SnapshotRequest_Header,
   896  	snap *OutgoingSnapshot,
   897  	newBatch func() storage.Batch,
   898  	sent func(),
   899  ) error {
   900  	start := timeutil.Now()
   901  	to := header.RaftMessageRequest.ToReplica
   902  	if err := stream.Send(&SnapshotRequest{Header: &header}); err != nil {
   903  		return err
   904  	}
   905  	// Wait until we get a response from the server. The recipient may queue us
   906  	// (only a limited number of snapshots are allowed concurrently) or flat-out
   907  	// reject the snapshot. After the initial message exchange, we'll go and send
   908  	// the actual snapshot (if not rejected).
   909  	resp, err := stream.Recv()
   910  	if err != nil {
   911  		storePool.throttle(throttleFailed, err.Error(), to.StoreID)
   912  		return err
   913  	}
   914  	switch resp.Status {
   915  	case SnapshotResponse_DECLINED:
   916  		if header.CanDecline {
   917  			declinedMsg := "reservation rejected"
   918  			if len(resp.Message) > 0 {
   919  				declinedMsg = resp.Message
   920  			}
   921  			err := &benignError{errors.Errorf("%s: remote declined %s: %s", to, snap, declinedMsg)}
   922  			storePool.throttle(throttleDeclined, err.Error(), to.StoreID)
   923  			return err
   924  		}
   925  		err := errors.Errorf("%s: programming error: remote declined required %s: %s",
   926  			to, snap, resp.Message)
   927  		storePool.throttle(throttleFailed, err.Error(), to.StoreID)
   928  		return err
   929  	case SnapshotResponse_ERROR:
   930  		storePool.throttle(throttleFailed, resp.Message, to.StoreID)
   931  		return errors.Errorf("%s: remote couldn't accept %s with error: %s",
   932  			to, snap, resp.Message)
   933  	case SnapshotResponse_ACCEPTED:
   934  	// This is the response we're expecting. Continue with snapshot sending.
   935  	default:
   936  		err := errors.Errorf("%s: server sent an invalid status while negotiating %s: %s",
   937  			to, snap, resp.Status)
   938  		storePool.throttle(throttleFailed, err.Error(), to.StoreID)
   939  		return err
   940  	}
   941  
   942  	durQueued := timeutil.Since(start)
   943  	start = timeutil.Now()
   944  
   945  	// The size of batches to send. This is the granularity of rate limiting.
   946  	const batchSize = 256 << 10 // 256 KB
   947  	targetRate, err := snapshotRateLimit(st, header.Priority)
   948  	if err != nil {
   949  		return errors.Wrapf(err, "%s", to)
   950  	}
   951  
   952  	// Convert the bytes/sec rate limit to batches/sec.
   953  	//
   954  	// TODO(peter): Using bytes/sec for rate limiting seems more natural but has
   955  	// practical difficulties. We either need to use a very large burst size
   956  	// which seems to disable the rate limiting, or call WaitN in smaller than
   957  	// burst size chunks which caused excessive slowness in testing. Would be
   958  	// nice to figure this out, but the batches/sec rate limit works for now.
   959  	limiter := rate.NewLimiter(targetRate/batchSize, 1 /* burst size */)
   960  
   961  	// Create a snapshotStrategy based on the desired snapshot strategy.
   962  	var ss snapshotStrategy
   963  	switch header.Strategy {
   964  	case SnapshotRequest_KV_BATCH:
   965  		ss = &kvBatchSnapshotStrategy{
   966  			raftCfg:   raftCfg,
   967  			batchSize: batchSize,
   968  			limiter:   limiter,
   969  			newBatch:  newBatch,
   970  		}
   971  	default:
   972  		log.Fatalf(ctx, "unknown snapshot strategy: %s", header.Strategy)
   973  	}
   974  
   975  	numBytesSent, err := ss.Send(ctx, stream, header, snap)
   976  	if err != nil {
   977  		return err
   978  	}
   979  	durSent := timeutil.Since(start)
   980  
   981  	// Notify the sent callback before the final snapshot request is sent so that
   982  	// the snapshots generated metric gets incremented before the snapshot is
   983  	// applied.
   984  	sent()
   985  	if err := stream.Send(&SnapshotRequest{Final: true}); err != nil {
   986  		return err
   987  	}
   988  	log.Infof(
   989  		ctx,
   990  		"streamed %s to %s in %.2fs @ %s/s: %s, rate-limit: %s/s, queued: %.2fs",
   991  		snap,
   992  		to,
   993  		durSent.Seconds(),
   994  		humanizeutil.IBytes(int64(float64(numBytesSent)/durSent.Seconds())),
   995  		ss.Status(),
   996  		humanizeutil.IBytes(int64(targetRate)),
   997  		durQueued.Seconds(),
   998  	)
   999  
  1000  	resp, err = stream.Recv()
  1001  	if err != nil {
  1002  		return errors.Wrapf(err, "%s: remote failed to apply snapshot", to)
  1003  	}
  1004  	// NB: wait for EOF which ensures that all processing on the server side has
  1005  	// completed (such as defers that might be run after the previous message was
  1006  	// received).
  1007  	if unexpectedResp, err := stream.Recv(); err != io.EOF {
  1008  		return errors.Errorf("%s: expected EOF, got resp=%v err=%v", to, unexpectedResp, err)
  1009  	}
  1010  	switch resp.Status {
  1011  	case SnapshotResponse_ERROR:
  1012  		return errors.Errorf("%s: remote failed to apply snapshot for reason %s", to, resp.Message)
  1013  	case SnapshotResponse_APPLIED:
  1014  		return nil
  1015  	default:
  1016  		return errors.Errorf("%s: server sent an invalid status during finalization: %s",
  1017  			to, resp.Status)
  1018  	}
  1019  }