github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/replica_destroy.go

github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/replica_destroy.go (about)

     1  // Copyright 2019 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package kvserver
    12  
    13  import (
    14  	"context"
    15  	"fmt"
    16  	"math"
    17  
    18  	"github.com/cockroachdb/cockroach/pkg/keys"
    19  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverpb"
    20  	"github.com/cockroachdb/cockroach/pkg/roachpb"
    21  	"github.com/cockroachdb/cockroach/pkg/storage"
    22  	"github.com/cockroachdb/cockroach/pkg/storage/enginepb"
    23  	"github.com/cockroachdb/cockroach/pkg/util/hlc"
    24  	"github.com/cockroachdb/cockroach/pkg/util/log"
    25  	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
    26  )
    27  
    28  // DestroyReason indicates if a replica is alive, destroyed, corrupted or pending destruction.
    29  type DestroyReason int
    30  
    31  const (
    32  	// The replica is alive.
    33  	destroyReasonAlive DestroyReason = iota
    34  	// The replica has been GCed or is in the process of being synchronously
    35  	// removed.
    36  	destroyReasonRemoved
    37  	// The replica has been merged into its left-hand neighbor, but its left-hand
    38  	// neighbor hasn't yet subsumed it.
    39  	destroyReasonMergePending
    40  )
    41  
    42  type destroyStatus struct {
    43  	reason DestroyReason
    44  	err    error
    45  }
    46  
    47  func (s destroyStatus) String() string {
    48  	return fmt.Sprintf("{%v %d}", s.err, s.reason)
    49  }
    50  
    51  func (s *destroyStatus) Set(err error, reason DestroyReason) {
    52  	s.err = err
    53  	s.reason = reason
    54  }
    55  
    56  // IsAlive returns true when a replica is alive.
    57  func (s destroyStatus) IsAlive() bool {
    58  	return s.reason == destroyReasonAlive
    59  }
    60  
    61  // Removed returns whether the replica has been removed.
    62  func (s destroyStatus) Removed() bool {
    63  	return s.reason == destroyReasonRemoved
    64  }
    65  
    66  // mergedTombstoneReplicaID is the replica ID written into the tombstone
    67  // for replicas which are part of a range which is known to have been merged.
    68  // This value should prevent any messages from stale replicas of that range from
    69  // ever resurrecting merged replicas. Whenever merging or subsuming a replica we
    70  // know new replicas can never be created so this value is used even if we
    71  // don't know the current replica ID.
    72  const mergedTombstoneReplicaID roachpb.ReplicaID = math.MaxInt32
    73  
    74  func (r *Replica) preDestroyRaftMuLocked(
    75  	ctx context.Context,
    76  	reader storage.Reader,
    77  	writer storage.Writer,
    78  	nextReplicaID roachpb.ReplicaID,
    79  	clearRangeIDLocalOnly bool,
    80  	mustUseClearRange bool,
    81  ) error {
    82  	desc := r.Desc()
    83  	err := clearRangeData(desc, reader, writer, clearRangeIDLocalOnly, mustUseClearRange)
    84  	if err != nil {
    85  		return err
    86  	}
    87  
    88  	// Save a tombstone to ensure that replica IDs never get reused.
    89  	//
    90  	// NB: Legacy tombstones (which are in the replicated key space) are wiped
    91  	// in clearRangeData, but that's OK since we're writing a new one in the same
    92  	// batch (and in particular, sequenced *after* the wipe).
    93  	return r.setTombstoneKey(ctx, writer, nextReplicaID)
    94  }
    95  
    96  func (r *Replica) postDestroyRaftMuLocked(ctx context.Context, ms enginepb.MVCCStats) error {
    97  	// Suggest the cleared range to the compactor queue.
    98  	//
    99  	// TODO(benesch): we would ideally atomically suggest the compaction with
   100  	// the deletion of the data itself.
   101  	if ms != (enginepb.MVCCStats{}) {
   102  		desc := r.Desc()
   103  		r.store.compactor.Suggest(ctx, kvserverpb.SuggestedCompaction{
   104  			StartKey: roachpb.Key(desc.StartKey),
   105  			EndKey:   roachpb.Key(desc.EndKey),
   106  			Compaction: kvserverpb.Compaction{
   107  				Bytes:            ms.Total(),
   108  				SuggestedAtNanos: timeutil.Now().UnixNano(),
   109  			},
   110  		})
   111  	}
   112  
   113  	// NB: we need the nil check below because it's possible that we're GC'ing a
   114  	// Replica without a replicaID, in which case it does not have a sideloaded
   115  	// storage.
   116  	//
   117  	// TODO(tschottdorf): at node startup, we should remove all on-disk
   118  	// directories belonging to replicas which aren't present. A crash before a
   119  	// call to postDestroyRaftMuLocked will currently leave the files around
   120  	// forever.
   121  	if r.raftMu.sideloaded != nil {
   122  		return r.raftMu.sideloaded.Clear(ctx)
   123  	}
   124  
   125  	return nil
   126  }
   127  
   128  // destroyRaftMuLocked deletes data associated with a replica, leaving a
   129  // tombstone. The Replica may not be initialized in which case only the
   130  // range ID local data is removed.
   131  func (r *Replica) destroyRaftMuLocked(ctx context.Context, nextReplicaID roachpb.ReplicaID) error {
   132  	startTime := timeutil.Now()
   133  
   134  	ms := r.GetMVCCStats()
   135  	batch := r.Engine().NewWriteOnlyBatch()
   136  	defer batch.Close()
   137  	clearRangeIDLocalOnly := !r.IsInitialized()
   138  	if err := r.preDestroyRaftMuLocked(
   139  		ctx,
   140  		r.Engine(),
   141  		batch,
   142  		nextReplicaID,
   143  		clearRangeIDLocalOnly,
   144  		false, /* mustUseClearRange */
   145  	); err != nil {
   146  		return err
   147  	}
   148  	preTime := timeutil.Now()
   149  
   150  	// We need to sync here because we are potentially deleting sideloaded
   151  	// proposals from the file system next. We could write the tombstone only in
   152  	// a synchronous batch first and then delete the data alternatively, but
   153  	// then need to handle the case in which there is both the tombstone and
   154  	// leftover replica data.
   155  	if err := batch.Commit(true); err != nil {
   156  		return err
   157  	}
   158  	commitTime := timeutil.Now()
   159  
   160  	if err := r.postDestroyRaftMuLocked(ctx, ms); err != nil {
   161  		return err
   162  	}
   163  	if r.IsInitialized() {
   164  		log.Infof(ctx, "removed %d (%d+%d) keys in %0.0fms [clear=%0.0fms commit=%0.0fms]",
   165  			ms.KeyCount+ms.SysCount, ms.KeyCount, ms.SysCount,
   166  			commitTime.Sub(startTime).Seconds()*1000,
   167  			preTime.Sub(startTime).Seconds()*1000,
   168  			commitTime.Sub(preTime).Seconds()*1000)
   169  	} else {
   170  		log.Infof(ctx, "removed uninitialized range in %0.0fms [clear=%0.0fms commit=%0.0fms]",
   171  			commitTime.Sub(startTime).Seconds()*1000,
   172  			preTime.Sub(startTime).Seconds()*1000,
   173  			commitTime.Sub(preTime).Seconds()*1000)
   174  	}
   175  	return nil
   176  }
   177  
   178  // disconnectReplicationRaftMuLocked is called when a Replica is being removed.
   179  // It cancels all outstanding proposals, closes the proposalQuota if there
   180  // is one, and removes the in-memory raft state.
   181  func (r *Replica) disconnectReplicationRaftMuLocked(ctx context.Context) {
   182  	r.raftMu.AssertHeld()
   183  	r.readOnlyCmdMu.Lock()
   184  	defer r.readOnlyCmdMu.Unlock()
   185  	r.mu.Lock()
   186  	defer r.mu.Unlock()
   187  	// NB: In the very rare scenario that we're being removed but currently
   188  	// believe we are the leaseholder and there are more requests waiting for
   189  	// quota than total quota then failure to close the proposal quota here could
   190  	// leave those requests stuck forever.
   191  	if pq := r.mu.proposalQuota; pq != nil {
   192  		pq.Close("destroyed")
   193  	}
   194  	r.mu.proposalBuf.FlushLockedWithoutProposing()
   195  	for _, p := range r.mu.proposals {
   196  		r.cleanupFailedProposalLocked(p)
   197  		// NB: each proposal needs its own version of the error (i.e. don't try to
   198  		// share the error across proposals).
   199  		p.finishApplication(ctx, proposalResult{
   200  			Err: roachpb.NewError(roachpb.NewAmbiguousResultError("removing replica")),
   201  		})
   202  	}
   203  	r.mu.internalRaftGroup = nil
   204  }
   205  
   206  // setTombstoneKey writes a tombstone to disk to ensure that replica IDs never
   207  // get reused. It determines what the minimum next replica ID can be using
   208  // the provided nextReplicaID and the Replica's own ID.
   209  //
   210  // We have to be careful to set the right key, since a replica can be using an
   211  // ID that it hasn't yet received a RangeDescriptor for if it receives raft
   212  // requests for that replica ID (as seen in #14231).
   213  func (r *Replica) setTombstoneKey(
   214  	ctx context.Context, writer storage.Writer, externalNextReplicaID roachpb.ReplicaID,
   215  ) error {
   216  	r.mu.Lock()
   217  	nextReplicaID := r.mu.state.Desc.NextReplicaID
   218  	if nextReplicaID < externalNextReplicaID {
   219  		nextReplicaID = externalNextReplicaID
   220  	}
   221  	if nextReplicaID > r.mu.tombstoneMinReplicaID {
   222  		r.mu.tombstoneMinReplicaID = nextReplicaID
   223  	}
   224  	r.mu.Unlock()
   225  	return writeTombstoneKey(ctx, writer, r.RangeID, nextReplicaID)
   226  }
   227  
   228  func writeTombstoneKey(
   229  	ctx context.Context,
   230  	writer storage.Writer,
   231  	rangeID roachpb.RangeID,
   232  	nextReplicaID roachpb.ReplicaID,
   233  ) error {
   234  	tombstoneKey := keys.RangeTombstoneKey(rangeID)
   235  	tombstone := &roachpb.RangeTombstone{
   236  		NextReplicaID: nextReplicaID,
   237  	}
   238  	// "Blind" because ms == nil and timestamp == hlc.Timestamp{}.
   239  	return storage.MVCCBlindPutProto(ctx, writer, nil, tombstoneKey,
   240  		hlc.Timestamp{}, tombstone, nil)
   241  }