github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/replica_destroy.go (about) 1 // Copyright 2019 The Cockroach Authors. 2 // 3 // Use of this software is governed by the Business Source License 4 // included in the file licenses/BSL.txt. 5 // 6 // As of the Change Date specified in that file, in accordance with 7 // the Business Source License, use of this software will be governed 8 // by the Apache License, Version 2.0, included in the file 9 // licenses/APL.txt. 10 11 package kvserver 12 13 import ( 14 "context" 15 "fmt" 16 "math" 17 18 "github.com/cockroachdb/cockroach/pkg/keys" 19 "github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverpb" 20 "github.com/cockroachdb/cockroach/pkg/roachpb" 21 "github.com/cockroachdb/cockroach/pkg/storage" 22 "github.com/cockroachdb/cockroach/pkg/storage/enginepb" 23 "github.com/cockroachdb/cockroach/pkg/util/hlc" 24 "github.com/cockroachdb/cockroach/pkg/util/log" 25 "github.com/cockroachdb/cockroach/pkg/util/timeutil" 26 ) 27 28 // DestroyReason indicates if a replica is alive, destroyed, corrupted or pending destruction. 29 type DestroyReason int 30 31 const ( 32 // The replica is alive. 33 destroyReasonAlive DestroyReason = iota 34 // The replica has been GCed or is in the process of being synchronously 35 // removed. 36 destroyReasonRemoved 37 // The replica has been merged into its left-hand neighbor, but its left-hand 38 // neighbor hasn't yet subsumed it. 39 destroyReasonMergePending 40 ) 41 42 type destroyStatus struct { 43 reason DestroyReason 44 err error 45 } 46 47 func (s destroyStatus) String() string { 48 return fmt.Sprintf("{%v %d}", s.err, s.reason) 49 } 50 51 func (s *destroyStatus) Set(err error, reason DestroyReason) { 52 s.err = err 53 s.reason = reason 54 } 55 56 // IsAlive returns true when a replica is alive. 57 func (s destroyStatus) IsAlive() bool { 58 return s.reason == destroyReasonAlive 59 } 60 61 // Removed returns whether the replica has been removed. 62 func (s destroyStatus) Removed() bool { 63 return s.reason == destroyReasonRemoved 64 } 65 66 // mergedTombstoneReplicaID is the replica ID written into the tombstone 67 // for replicas which are part of a range which is known to have been merged. 68 // This value should prevent any messages from stale replicas of that range from 69 // ever resurrecting merged replicas. Whenever merging or subsuming a replica we 70 // know new replicas can never be created so this value is used even if we 71 // don't know the current replica ID. 72 const mergedTombstoneReplicaID roachpb.ReplicaID = math.MaxInt32 73 74 func (r *Replica) preDestroyRaftMuLocked( 75 ctx context.Context, 76 reader storage.Reader, 77 writer storage.Writer, 78 nextReplicaID roachpb.ReplicaID, 79 clearRangeIDLocalOnly bool, 80 mustUseClearRange bool, 81 ) error { 82 desc := r.Desc() 83 err := clearRangeData(desc, reader, writer, clearRangeIDLocalOnly, mustUseClearRange) 84 if err != nil { 85 return err 86 } 87 88 // Save a tombstone to ensure that replica IDs never get reused. 89 // 90 // NB: Legacy tombstones (which are in the replicated key space) are wiped 91 // in clearRangeData, but that's OK since we're writing a new one in the same 92 // batch (and in particular, sequenced *after* the wipe). 93 return r.setTombstoneKey(ctx, writer, nextReplicaID) 94 } 95 96 func (r *Replica) postDestroyRaftMuLocked(ctx context.Context, ms enginepb.MVCCStats) error { 97 // Suggest the cleared range to the compactor queue. 98 // 99 // TODO(benesch): we would ideally atomically suggest the compaction with 100 // the deletion of the data itself. 101 if ms != (enginepb.MVCCStats{}) { 102 desc := r.Desc() 103 r.store.compactor.Suggest(ctx, kvserverpb.SuggestedCompaction{ 104 StartKey: roachpb.Key(desc.StartKey), 105 EndKey: roachpb.Key(desc.EndKey), 106 Compaction: kvserverpb.Compaction{ 107 Bytes: ms.Total(), 108 SuggestedAtNanos: timeutil.Now().UnixNano(), 109 }, 110 }) 111 } 112 113 // NB: we need the nil check below because it's possible that we're GC'ing a 114 // Replica without a replicaID, in which case it does not have a sideloaded 115 // storage. 116 // 117 // TODO(tschottdorf): at node startup, we should remove all on-disk 118 // directories belonging to replicas which aren't present. A crash before a 119 // call to postDestroyRaftMuLocked will currently leave the files around 120 // forever. 121 if r.raftMu.sideloaded != nil { 122 return r.raftMu.sideloaded.Clear(ctx) 123 } 124 125 return nil 126 } 127 128 // destroyRaftMuLocked deletes data associated with a replica, leaving a 129 // tombstone. The Replica may not be initialized in which case only the 130 // range ID local data is removed. 131 func (r *Replica) destroyRaftMuLocked(ctx context.Context, nextReplicaID roachpb.ReplicaID) error { 132 startTime := timeutil.Now() 133 134 ms := r.GetMVCCStats() 135 batch := r.Engine().NewWriteOnlyBatch() 136 defer batch.Close() 137 clearRangeIDLocalOnly := !r.IsInitialized() 138 if err := r.preDestroyRaftMuLocked( 139 ctx, 140 r.Engine(), 141 batch, 142 nextReplicaID, 143 clearRangeIDLocalOnly, 144 false, /* mustUseClearRange */ 145 ); err != nil { 146 return err 147 } 148 preTime := timeutil.Now() 149 150 // We need to sync here because we are potentially deleting sideloaded 151 // proposals from the file system next. We could write the tombstone only in 152 // a synchronous batch first and then delete the data alternatively, but 153 // then need to handle the case in which there is both the tombstone and 154 // leftover replica data. 155 if err := batch.Commit(true); err != nil { 156 return err 157 } 158 commitTime := timeutil.Now() 159 160 if err := r.postDestroyRaftMuLocked(ctx, ms); err != nil { 161 return err 162 } 163 if r.IsInitialized() { 164 log.Infof(ctx, "removed %d (%d+%d) keys in %0.0fms [clear=%0.0fms commit=%0.0fms]", 165 ms.KeyCount+ms.SysCount, ms.KeyCount, ms.SysCount, 166 commitTime.Sub(startTime).Seconds()*1000, 167 preTime.Sub(startTime).Seconds()*1000, 168 commitTime.Sub(preTime).Seconds()*1000) 169 } else { 170 log.Infof(ctx, "removed uninitialized range in %0.0fms [clear=%0.0fms commit=%0.0fms]", 171 commitTime.Sub(startTime).Seconds()*1000, 172 preTime.Sub(startTime).Seconds()*1000, 173 commitTime.Sub(preTime).Seconds()*1000) 174 } 175 return nil 176 } 177 178 // disconnectReplicationRaftMuLocked is called when a Replica is being removed. 179 // It cancels all outstanding proposals, closes the proposalQuota if there 180 // is one, and removes the in-memory raft state. 181 func (r *Replica) disconnectReplicationRaftMuLocked(ctx context.Context) { 182 r.raftMu.AssertHeld() 183 r.readOnlyCmdMu.Lock() 184 defer r.readOnlyCmdMu.Unlock() 185 r.mu.Lock() 186 defer r.mu.Unlock() 187 // NB: In the very rare scenario that we're being removed but currently 188 // believe we are the leaseholder and there are more requests waiting for 189 // quota than total quota then failure to close the proposal quota here could 190 // leave those requests stuck forever. 191 if pq := r.mu.proposalQuota; pq != nil { 192 pq.Close("destroyed") 193 } 194 r.mu.proposalBuf.FlushLockedWithoutProposing() 195 for _, p := range r.mu.proposals { 196 r.cleanupFailedProposalLocked(p) 197 // NB: each proposal needs its own version of the error (i.e. don't try to 198 // share the error across proposals). 199 p.finishApplication(ctx, proposalResult{ 200 Err: roachpb.NewError(roachpb.NewAmbiguousResultError("removing replica")), 201 }) 202 } 203 r.mu.internalRaftGroup = nil 204 } 205 206 // setTombstoneKey writes a tombstone to disk to ensure that replica IDs never 207 // get reused. It determines what the minimum next replica ID can be using 208 // the provided nextReplicaID and the Replica's own ID. 209 // 210 // We have to be careful to set the right key, since a replica can be using an 211 // ID that it hasn't yet received a RangeDescriptor for if it receives raft 212 // requests for that replica ID (as seen in #14231). 213 func (r *Replica) setTombstoneKey( 214 ctx context.Context, writer storage.Writer, externalNextReplicaID roachpb.ReplicaID, 215 ) error { 216 r.mu.Lock() 217 nextReplicaID := r.mu.state.Desc.NextReplicaID 218 if nextReplicaID < externalNextReplicaID { 219 nextReplicaID = externalNextReplicaID 220 } 221 if nextReplicaID > r.mu.tombstoneMinReplicaID { 222 r.mu.tombstoneMinReplicaID = nextReplicaID 223 } 224 r.mu.Unlock() 225 return writeTombstoneKey(ctx, writer, r.RangeID, nextReplicaID) 226 } 227 228 func writeTombstoneKey( 229 ctx context.Context, 230 writer storage.Writer, 231 rangeID roachpb.RangeID, 232 nextReplicaID roachpb.ReplicaID, 233 ) error { 234 tombstoneKey := keys.RangeTombstoneKey(rangeID) 235 tombstone := &roachpb.RangeTombstone{ 236 NextReplicaID: nextReplicaID, 237 } 238 // "Blind" because ms == nil and timestamp == hlc.Timestamp{}. 239 return storage.MVCCBlindPutProto(ctx, writer, nil, tombstoneKey, 240 hlc.Timestamp{}, tombstone, nil) 241 }