github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/store_split.go

// Copyright 2019 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package kvserver

import (
	"bytes"
	"context"

	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/stateloader"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/storage"
	"github.com/cockroachdb/cockroach/pkg/storage/enginepb"
	"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/cockroachdb/errors"
	"go.etcd.io/etcd/raft"
	"go.etcd.io/etcd/raft/raftpb"
)

// splitPreApply is called when the raft command is applied. Any
// changes to the given ReadWriter will be written atomically with the
// split commit.
func splitPreApply(
	ctx context.Context, readWriter storage.ReadWriter, split roachpb.SplitTrigger, r *Replica,
) {
	// Sanity check that the store is in the split.
	//
	// The exception to that is if the DisableEagerReplicaRemoval testing flag is
	// enabled.
	//
	// TODO(ajwerner): rethink DisableEagerReplicaRemoval and remove this in
	// 20.1 after there are no more preemptive snapshots.
	_, hasRightDesc := split.RightDesc.GetReplicaDescriptor(r.StoreID())
	_, hasLeftDesc := split.LeftDesc.GetReplicaDescriptor(r.StoreID())
	if !hasRightDesc || !hasLeftDesc {
		log.Fatalf(ctx, "cannot process split on s%s which does not exist in the split: %+v",
			r.StoreID(), split)
	}

	// Check on the RHS: we need to ensure that it exists and has a minReplicaID
	// less than or equal to the replica we're about to initialize.
	//
	// The right hand side of the split was already created (and its raftMu
	// acquired) in Replica.acquireSplitLock. It must be present here if it hasn't
	// been removed in the meantime (handled below).
	rightRepl, err := r.store.GetReplica(split.RightDesc.RangeID)
	if roachpb.IsRangeNotFoundError(err) {
		// The right hand side we were planning to populate has already been removed.
		// We handle this below.
		rightRepl = nil
	} else if err != nil {
		log.Fatalf(ctx, "failed to get RHS replica: %v", err)
	}
	// Check to see if we know that the RHS has already been removed from this
	// store at the replica ID implied by the split.
	if rightRepl == nil || rightRepl.isNewerThanSplit(&split) {
		// We're in the rare case where we know that the RHS has been removed
		// and re-added with a higher replica ID (and then maybe removed again).
		//
		// To apply the split, we need to "throw away" the data that would belong to
		// the RHS, i.e. we clear the user data the RHS would have inherited from the
		// LHS due to the split and additionally clear all of the range ID local state
		// that the split trigger writes into the RHS.
		//
		// We know we've never processed a snapshot for the right range because the
		// LHS prevents any incoming snapshots until the split has executed (i.e. now).
		// It is important to preserve the HardState, however, because we might have
		// already voted at a higher term. In general this shouldn't happen because
		// we add learners and then promote them only after we snapshot, but we're
		// going to be extra careful in case future versions of cockroach somehow
		// promote replicas without ensuring that a snapshot has been received.
		//
		// Rather than specifically deleting around the data we want to preserve,
		// we read the HardState to preserve it, clear everything, and write back
		// the HardState and tombstone. Note that we only do this if rightRepl
		// exists; if it doesn't, there's no Raft state to massage (when rightRepl
		// was removed, a tombstone was written instead).
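		//
		// To make the "removed and re-added" case concrete (illustrative,
		// made-up IDs): suppose the split's RightDesc assigns this store
		// replica ID 3 for the RHS. Before this store applies the split, the
		// RHS range (already live on stores that have applied the split) is
		// moved off this store and then back, leaving behind a tombstone and
		// possibly an uninitialized replica with ID 4. isNewerThanSplit
		// detects that the store's view of the RHS is newer than the split's
		// descriptor, and the code below discards the state that would have
		// belonged to the stale replica ID 3 instead of installing it.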
		var hs raftpb.HardState
		if rightRepl != nil {
			// Assert that the rightRepl is not initialized. We're about to clear out
			// the data of the RHS of the split; we cannot have already accepted a
			// snapshot to initialize this newer RHS.
			if rightRepl.IsInitialized() {
				log.Fatalf(ctx, "unexpectedly found initialized newer RHS of split: %v", rightRepl.Desc())
			}
			hs, err = rightRepl.raftMu.stateLoader.LoadHardState(ctx, readWriter)
			if err != nil {
				log.Fatalf(ctx, "failed to load hard state for removed rhs: %v", err)
			}
		}
		const rangeIDLocalOnly = false
		const mustUseClearRange = false
		if err := clearRangeData(&split.RightDesc, readWriter, readWriter, rangeIDLocalOnly, mustUseClearRange); err != nil {
			log.Fatalf(ctx, "failed to clear range data for removed rhs: %v", err)
		}
		if rightRepl != nil {
			if err := rightRepl.raftMu.stateLoader.SetHardState(ctx, readWriter, hs); err != nil {
				log.Fatalf(ctx, "failed to set hard state with 0 commit index for removed rhs: %v", err)
			}
		}
		return
	}

	// Update the raft HardState with the new Commit value now that the
	// replica is initialized (combining it with existing or default
	// Term and Vote). This is the common case.
	rsl := stateloader.Make(split.RightDesc.RangeID)
	if err := rsl.SynthesizeRaftState(ctx, readWriter); err != nil {
		log.Fatalf(ctx, "%v", err)
	}

	// The initialMaxClosed is assigned to the RHS replica to ensure that
	// follower reads do not regress following the split. After the split occurs
	// there will be no information in the closedts subsystem about the newly
	// minted RHS range from its leaseholder's store. Furthermore, the RHS will
	// have a lease start time equal to that of the LHS, which might be quite
	// old. This means that timestamps which follow the lease start time for the
	// LHS but are below the current closed timestamp for the LHS would no
	// longer be readable on the RHS after the split.
	//
	// It is necessary for correctness that the call to maxClosed used to
	// determine the current closed timestamp happens during the splitPreApply
	// so that it uses a LAI that is _before_ the index at which this split is
	// applied. If it were to refer to a LAI equal to or after the split then
	// the value of initialMaxClosed might be unsafe.
	//
	// Concretely, any closed timestamp based on an LAI that is equal to or
	// above the split index might be larger than the initial closed timestamp
	// assigned to the RHS range's initial leaseholder. This is because the LHS
	// range's leaseholder could continue closing out timestamps at the split's
	// LAI after applying the split. Slow followers in that range could hear
	// about these closed timestamp notifications before applying the split
	// themselves. If these slow followers were allowed to pass these closed
	// timestamps created after the split to the RHS replicas they create during
	// the application of the split, then these RHS replicas might end up with
	// initialMaxClosed values above their current range's official closed
	// timestamp. The leaseholder of the RHS range could then propose a write at
	// a timestamp below this initialMaxClosed, violating the closed timestamp
	// system's most important property.
	//
	// Using an LAI from before the index at which this split is applied avoids
	// the hazard and ensures that no replica on the RHS is created with an
	// initialMaxClosed that could be violated by a proposal on the RHS's
	// initial leaseholder. See #44878.
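	//
	// A sketch of the hazard with illustrative numbers: say the split applies
	// at LAI 10 on the LHS. After applying the split, the LHS leaseholder keeps
	// closing timestamps, e.g. it closes ts=100 at LAI 12. A slow LHS follower
	// can learn about (LAI 12, ts=100) before it has applied the split. If it
	// were allowed to seed the RHS replica it creates with
	// initialMaxClosed=100, the RHS leaseholder, whose own closed timestamp
	// tracking for the new range starts fresh, could still evaluate a write
	// below ts=100, and a follower read at ts=100 served from that seeded RHS
	// replica would miss it. Reading maxClosed here, before the split's LAI,
	// avoids that.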
	initialMaxClosed, _ := r.maxClosed(ctx)
	rightRepl.mu.Lock()
	rightRepl.mu.initialMaxClosed = initialMaxClosed
	rightRepl.mu.Unlock()
}

// splitPostApply is the part of the split trigger which coordinates the actual
// split with the Store. Requires that Replica.raftMu is held.
func splitPostApply(
	ctx context.Context, deltaMS enginepb.MVCCStats, split *roachpb.SplitTrigger, r *Replica,
) {
	// rightReplOrNil will be nil if the RHS replica at the ID of the split is
	// already known to be removed, generally because we know that the replica on
	// this store has been re-added at a higher replica ID.
	rightReplOrNil := prepareRightReplicaForSplit(ctx, split, r)
	// Add the RHS replica to the store. This step atomically updates
	// the EndKey of the LHS replica and also adds the RHS replica
	// to the store's replica map.
	if err := r.store.SplitRange(ctx, r, rightReplOrNil, split); err != nil {
		// Our in-memory state has diverged from the on-disk state.
		log.Fatalf(ctx, "%s: failed to update Store after split: %+v", r, err)
	}

	// Update store stats with difference in stats before and after split.
	r.store.metrics.addMVCCStats(deltaMS)

	now := r.store.Clock().Now()

	// While performing the split, zone config changes or a newly created table
	// might require the range to be split again. Enqueue both the left and right
	// ranges to speed up such splits. See #10160.
	r.store.splitQueue.MaybeAddAsync(ctx, r, now)
	// If the range was not properly replicated before the split, the replicate
	// queue may not have picked it up (due to the need for a split). Enqueue
	// both the left and right ranges to speed up a potentially necessary
	// replication. See #7022 and #7800.
	r.store.replicateQueue.MaybeAddAsync(ctx, r, now)

	if rightReplOrNil != nil {
		r.store.splitQueue.MaybeAddAsync(ctx, rightReplOrNil, now)
		r.store.replicateQueue.MaybeAddAsync(ctx, rightReplOrNil, now)
		if len(split.RightDesc.Replicas().All()) == 1 {
			// TODO(peter): In single-node clusters, we enqueue the right-hand side of
			// the split (the new range) for Raft processing so that the corresponding
			// Raft group is created. This shouldn't be necessary for correctness, but
			// some tests rely on this (e.g. server.TestNodeStatusWritten).
			r.store.enqueueRaftUpdateCheck(rightReplOrNil.RangeID)
		}
	}
}
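
// NB: roughly, a split is applied in two stages on each store: splitPreApply
// runs while the split's Raft command is being applied and writes the RHS's
// on-disk Raft state in the same batch that commits the split; splitPostApply
// runs once that batch has committed, finishes the in-memory initialization of
// the RHS via prepareRightReplicaForSplit, and installs it in the Store via
// SplitRange.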

// prepareRightReplicaForSplit is a helper for splitPostApply.
// Requires that r.raftMu is held.
func prepareRightReplicaForSplit(
	ctx context.Context, split *roachpb.SplitTrigger, r *Replica,
) (rightReplicaOrNil *Replica) {
	// The right hand side of the split was already created (and its raftMu
	// acquired) in Replica.acquireSplitLock. It must be present here.
	rightRepl, err := r.store.GetReplica(split.RightDesc.RangeID)
	// If the RHS replica at the point of the split was known to be removed
	// during the application of the split then we may not find it here. That's
	// fine, carry on. See also:
	_, _ = r.acquireSplitLock, splitPostApply
	if roachpb.IsRangeNotFoundError(err) {
		return nil
	}
	if err != nil {
		log.Fatalf(ctx, "unable to find RHS replica: %+v", err)
	}
	// Already holding raftMu, see above.
	rightRepl.mu.Lock()

	// If we know that the RHS has already been removed at this replica ID
	// then we also know that its data has already been removed by the preApply
	// so we skip initializing it as the RHS of the split.
	if rightRepl.isNewerThanSplitRLocked(split) {
		rightRepl.mu.Unlock()
		return nil
	}

	// Finish initialization of the RHS.
	err = rightRepl.loadRaftMuLockedReplicaMuLocked(&split.RightDesc)
	rightRepl.mu.Unlock()
	if err != nil {
		log.Fatalf(ctx, "%v", err)
	}

	// Copy the minLeaseProposedTS from the LHS and grab the RHS's lease.
	r.mu.RLock()
	rightRepl.mu.Lock()
	rightRepl.mu.minLeaseProposedTS = r.mu.minLeaseProposedTS
	rightLease := *rightRepl.mu.state.Lease
	rightRepl.mu.Unlock()
	r.mu.RUnlock()

	// We need to explicitly wake up the Raft group on the right-hand range or
	// else the range could be underreplicated for an indefinite period of time.
	//
	// Specifically, suppose one of the replicas of the left-hand range never
	// applies this split trigger, e.g., because it catches up via a snapshot that
	// advances it past this split. That store won't create the right-hand replica
	// until it receives a Raft message addressed to the right-hand range. But
	// since new replicas start out quiesced, unless we explicitly awaken the
	// Raft group, there might not be any Raft traffic for quite a while.
	err = rightRepl.withRaftGroup(true, func(r *raft.RawNode) (unquiesceAndWakeLeader bool, _ error) {
		return true, nil
	})
	if err != nil {
		log.Fatalf(ctx, "unable to create raft group for right-hand range in split: %+v", err)
	}

	// Invoke the leasePostApply method to ensure we properly initialize
	// the replica according to whether it holds the lease. This enables
	// the txnWaitQueue.
	rightRepl.leasePostApply(ctx, rightLease, false /* permitJump */)
	return rightRepl
}
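
// As an illustration of the descriptor sanity check at the top of SplitRange
// below (hypothetical keys): if the pre-split LHS covers [a, z) and the split
// key is m, the trigger carries a LeftDesc of [a, m) and a RightDesc of [m, z).
// The old LHS EndKey (z) must equal the RHS EndKey, and the old LHS StartKey
// (a) must sort strictly before the RHS StartKey (m); anything else means the
// descriptors do not describe a split of this replica's keyspace.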

// SplitRange shortens the original range to accommodate the new range. The new
// range is added to the ranges map and the replicasByKey btree. leftRepl.raftMu
// and, if non-nil, rightReplOrNil.raftMu must be held.
//
// This is only called from the split trigger in the context of the execution
// of a Raft command. Note that rightReplOrNil will be nil if the replica
// described by the split's RightDesc is known to have been removed.
func (s *Store) SplitRange(
	ctx context.Context, leftRepl, rightReplOrNil *Replica, split *roachpb.SplitTrigger,
) error {
	rightDesc := &split.RightDesc
	newLeftDesc := &split.LeftDesc
	oldLeftDesc := leftRepl.Desc()
	if !bytes.Equal(oldLeftDesc.EndKey, rightDesc.EndKey) ||
		bytes.Compare(oldLeftDesc.StartKey, rightDesc.StartKey) >= 0 {
		return errors.Errorf("left range is not splittable by right range: %+v, %+v", oldLeftDesc, rightDesc)
	}

	s.mu.Lock()
	defer s.mu.Unlock()
	if exRng, ok := s.mu.uninitReplicas[rightDesc.RangeID]; rightReplOrNil != nil && ok {
		// If we have an uninitialized replica of the new range we require pointer
		// equivalence with rightReplOrNil. See Store.splitTriggerPostApply().
		if exRng != rightReplOrNil {
			log.Fatalf(ctx, "found unexpected uninitialized replica: %s vs %s", exRng, rightReplOrNil)
		}
		// NB: We only remove from uninitReplicas and the replicaQueues maps here
		// so that we don't leave open a window where a replica is temporarily not
		// present in Store.mu.replicas.
		delete(s.mu.uninitReplicas, rightDesc.RangeID)
		s.replicaQueues.Delete(int64(rightDesc.RangeID))
	}

	leftRepl.setDescRaftMuLocked(ctx, newLeftDesc)

	// Clear the LHS lock and txn wait-queues, to redirect to the RHS if
	// appropriate. We do this after setDescRaftMuLocked to ensure
	// that no pre-split commands are inserted into the wait-queues after we
	// clear them.
	leftRepl.concMgr.OnRangeSplit()

	// Clear the original range's request stats, since they include requests for
	// spans that are now owned by the new range.
	leftRepl.leaseholderStats.resetRequestCounts()

	if rightReplOrNil == nil {
		throwawayRightWriteStats := new(replicaStats)
		leftRepl.writeStats.splitRequestCounts(throwawayRightWriteStats)
	} else {
		rightRepl := rightReplOrNil
		leftRepl.writeStats.splitRequestCounts(rightRepl.writeStats)
		if err := s.addReplicaInternalLocked(rightRepl); err != nil {
			return errors.Errorf("unable to add replica %v: %s", rightRepl, err)
		}

		// Update the replica's cached byte thresholds. This is a no-op if the system
		// config is not available, in which case we rely on the next gossip update
		// to perform the update.
		if err := rightRepl.updateRangeInfo(rightRepl.Desc()); err != nil {
			return err
		}
		// Add the range to metrics and maybe gossip on capacity change.
		s.metrics.ReplicaCount.Inc(1)
		s.maybeGossipOnCapacityChange(ctx, rangeAddEvent)
	}

	return nil
}
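
// NB: even when the RHS is already known to have been removed
// (rightReplOrNil == nil), SplitRange above still splits the LHS's write stats
// into a throwaway replicaStats object, presumably so that the LHS's
// request-rate accounting is reduced just as it would be had the RHS survived,
// keeping load-based decisions for the shrunken LHS consistent.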