github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/store_create_replica.go

// Copyright 2019 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package kvserver

import (
	"context"
	"time"
	"unsafe"

	"github.com/cockroachdb/cockroach/pkg/keys"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/storage"
	"github.com/cockroachdb/cockroach/pkg/util/hlc"
	"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/cockroachdb/cockroach/pkg/util/retry"
	"github.com/cockroachdb/errors"
)

var errRetry = errors.New("retry: orphaned replica")

// getOrCreateReplica returns a replica for the given RangeID, creating an
// uninitialized replica if necessary. The caller must not hold the store's
// lock. The returned replica has Replica.raftMu locked and it is the caller's
// responsibility to unlock it.
func (s *Store) getOrCreateReplica(
	ctx context.Context,
	rangeID roachpb.RangeID,
	replicaID roachpb.ReplicaID,
	creatingReplica *roachpb.ReplicaDescriptor,
	isLearner bool,
) (_ *Replica, created bool, _ error) {
	if replicaID == 0 {
		log.Fatalf(ctx, "cannot construct a Replica for range %d with 0 id", rangeID)
	}
	// We need a retry loop because the replica we find in the map may be in
	// the process of being removed or may need to be removed. Retries in the
	// loop imply that a removal is actually being carried out, not that we're
	// waiting on a queue.
	r := retry.Start(retry.Options{
		InitialBackoff: time.Microsecond,
		// Cap the backoff at a small amount; we are only waiting for data
		// that might need to be cleared.
		MaxBackoff: 10 * time.Millisecond,
	})
	for {
		r.Next()
		r, created, err := s.tryGetOrCreateReplica(
			ctx,
			rangeID,
			replicaID,
			creatingReplica,
			isLearner,
		)
		if errors.Is(err, errRetry) {
			continue
		}
		if err != nil {
			return nil, false, err
		}
		return r, created, err
	}
}
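
// Illustrative sketch of the retry idiom above, shown in isolation.
// doAttempt is a hypothetical stand-in for tryGetOrCreateReplica;
// retry.Start, retry.Options, and errors.Is are the same APIs used in
// getOrCreateReplica:
//
//	r := retry.Start(retry.Options{
//		InitialBackoff: time.Microsecond,
//		MaxBackoff:     10 * time.Millisecond,
//	})
//	for {
//		r.Next() // block for the current backoff interval
//		repl, created, err := doAttempt()
//		if errors.Is(err, errRetry) {
//			continue // an orphaned replica is still being torn down
//		}
//		return repl, created, err
//	}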

// tryGetOrCreateReplica performs a single attempt at looking up or creating
// a replica. It will fail with errRetry if it finds a Replica that has been
// destroyed (and is no longer in Store.mu.replicas) or if, during creation,
// another goroutine gets there first. In either case, a subsequent call to
// tryGetOrCreateReplica will likely succeed, hence the loop in
// getOrCreateReplica.
func (s *Store) tryGetOrCreateReplica(
	ctx context.Context,
	rangeID roachpb.RangeID,
	replicaID roachpb.ReplicaID,
	creatingReplica *roachpb.ReplicaDescriptor,
	isLearner bool,
) (_ *Replica, created bool, _ error) {
	// The common case: look up an existing (initialized) replica.
	if value, ok := s.mu.replicas.Load(int64(rangeID)); ok {
		repl := (*Replica)(value)
		repl.raftMu.Lock() // not unlocked on success
		repl.mu.Lock()

		// The current replica is removed, go back around.
		if repl.mu.destroyStatus.Removed() {
			repl.mu.Unlock()
			repl.raftMu.Unlock()
			return nil, false, errRetry
		}

		// Drop messages from replicas we know to be too old.
		if fromReplicaIsTooOld(repl, creatingReplica) {
			repl.mu.Unlock()
			repl.raftMu.Unlock()
			return nil, false, roachpb.NewReplicaTooOldError(creatingReplica.ReplicaID)
		}

		// The current replica needs to be removed; remove it and go back around.
		if toTooOld := repl.mu.replicaID < replicaID; toTooOld {
			if shouldLog := log.V(1); shouldLog {
				log.Infof(ctx, "found message for replica ID %d which is newer than %v",
					replicaID, repl)
			}

			repl.mu.Unlock()
			if err := s.removeReplicaRaftMuLocked(ctx, repl, replicaID, RemoveOptions{
				DestroyData: true,
			}); err != nil {
				log.Fatalf(ctx, "failed to remove replica: %v", err)
			}
			repl.raftMu.Unlock()
			return nil, false, errRetry
		}
		defer repl.mu.Unlock()

		if repl.mu.replicaID > replicaID {
			// The sender is behind and is sending to an old replica.
			// We could silently drop this message but this way we'll inform the
			// sender that they may no longer exist.
			repl.raftMu.Unlock()
			return nil, false, &roachpb.RaftGroupDeletedError{}
		}
		if repl.mu.replicaID != replicaID {
			// This case should have been caught by handleToReplicaTooOld.
			log.Fatalf(ctx, "intended replica id %d unexpectedly does not match the current replica %v",
				replicaID, repl)
		}
		return repl, false, nil
	}

	// No replica currently exists, so we'll try to create one. Before creating
	// the replica, see if there is a tombstone which would indicate that this
	// is a stale message.
	// NB: we check this before creating a new Replica and adding it to the
	// Store's Range map even though we must check it again afterwards to avoid
	// race conditions. This double-checked locking is an optimization: it lets
	// us skip the work below when we can tell ahead of time that the Replica
	// should not be created.
	tombstoneKey := keys.RangeTombstoneKey(rangeID)
	var tombstone roachpb.RangeTombstone
	if ok, err := storage.MVCCGetProto(
		ctx, s.Engine(), tombstoneKey, hlc.Timestamp{}, &tombstone, storage.MVCCGetOptions{},
	); err != nil {
		return nil, false, err
	} else if ok && replicaID != 0 && replicaID < tombstone.NextReplicaID {
		return nil, false, &roachpb.RaftGroupDeletedError{}
	}

	// Create a new replica and lock it for raft processing.
	uninitializedDesc := &roachpb.RangeDescriptor{
		RangeID: rangeID,
		// NB: other fields are unknown; we need to populate them from a
		// snapshot.
	}
	repl := newUnloadedReplica(ctx, uninitializedDesc, s, replicaID)
	repl.creatingReplica = creatingReplica
	repl.raftMu.Lock() // not unlocked

	// Install the replica in the store's replica map. The replica is in an
	// inconsistent state, but nobody will be accessing it while we hold its
	// locks.
	s.mu.Lock()
	// Grab the internal Replica state lock to ensure nobody mucks with our
	// replica even outside of raft processing. We have to do this after
	// grabbing Store.mu to maintain the lock ordering invariant.
	repl.mu.Lock()
	repl.mu.tombstoneMinReplicaID = tombstone.NextReplicaID

	// NB: A Replica should never be in the store's replicas map with a nil
	// descriptor. Assign it directly here. In the case that the Replica should
	// exist (which we confirm with another check of the tombstone below), we'll
	// re-initialize the replica with the same uninitializedDesc.
	//
	// During the short window between here and the call to
	// s.unlinkReplicaByRangeIDLocked() in the failure branch below, the Replica
	// used to have a nil descriptor while being present in the map. And while
	// the destroy status had been set, not every code path which inspects the
	// descriptor also checks the destroy status.
	repl.mu.state.Desc = uninitializedDesc
	// Add the range to the range map, but not to replicasByKey, since the
	// range's start key is unknown. The range will be added to replicasByKey
	// later when a snapshot is applied. Because we were not holding Store.mu
	// during the lookup above, another goroutine might have snuck in and
	// created the replica, so we retry on error.
	if err := s.addReplicaToRangeMapLocked(repl); err != nil {
		repl.mu.Unlock()
		s.mu.Unlock()
		repl.raftMu.Unlock()
		return nil, false, errRetry
	}
	s.mu.uninitReplicas[repl.RangeID] = repl
	s.mu.Unlock() // NB: unlocking out of order

	// Initialize the Replica with the replicaID.
	if err := func() error {
		// Check for a tombstone again now that we've inserted into the Range
		// map. This double-checked locking ensures that we avoid a race where a
		// replica is created and destroyed between the initial unsynchronized
		// tombstone check and the Range map linearization point. By checking
		// again now, we make sure to synchronize with any goroutine that wrote
		// a tombstone and then removed an old replica from the Range map.
		if ok, err := storage.MVCCGetProto(
			ctx, s.Engine(), tombstoneKey, hlc.Timestamp{}, &tombstone, storage.MVCCGetOptions{},
		); err != nil {
			return err
		} else if ok && replicaID < tombstone.NextReplicaID {
			return &roachpb.RaftGroupDeletedError{}
		}

		// An uninitialized replica should have an empty HardState.Commit at
		// all times. Failure to maintain this invariant indicates corruption.
		// And yet, we have observed this in the wild. See #40213.
		if hs, err := repl.mu.stateLoader.LoadHardState(ctx, s.Engine()); err != nil {
			return err
		} else if hs.Commit != 0 {
			log.Fatalf(ctx, "found non-zero HardState.Commit on uninitialized replica %s. HS=%+v", repl, hs)
		}
		return repl.loadRaftMuLockedReplicaMuLocked(uninitializedDesc)
	}(); err != nil {
		// Mark the replica as destroyed and remove it from the replicas maps to
		// ensure nobody tries to use it.
		repl.mu.destroyStatus.Set(errors.Wrapf(err, "%s: failed to initialize", repl), destroyReasonRemoved)
		repl.mu.Unlock()
		s.mu.Lock()
		s.unlinkReplicaByRangeIDLocked(rangeID)
		s.mu.Unlock()
		repl.raftMu.Unlock()
		return nil, false, err
	}
	repl.mu.Unlock()
	return repl, true, nil
}
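
// Illustrative interleaving (hypothetical, for exposition only) of the
// double-checked tombstone protocol in tryGetOrCreateReplica. A remover
// writes the tombstone before unlinking the old replica from the Range map,
// so a creator that re-reads the tombstone after its own map insertion
// cannot miss a removal that raced with the first, unsynchronized check:
//
//	creator goroutine                     remover goroutine
//	-----------------                     -----------------
//	read tombstone: not found
//	                                      write tombstone (NextReplicaID = n)
//	                                      unlink old replica from Range map
//	insert new replica into Range map
//	re-read tombstone: NextReplicaID = n
//	replicaID < n: abort creation with RaftGroupDeletedError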

// fromReplicaIsTooOld returns true if the creatingReplica is deemed to be a
// former member of the range which has since been removed.
// Assumes toReplica.mu is held.
func fromReplicaIsTooOld(toReplica *Replica, fromReplica *roachpb.ReplicaDescriptor) bool {
	toReplica.mu.AssertHeld()
	if fromReplica == nil {
		return false
	}
	desc := toReplica.mu.state.Desc
	_, found := desc.GetReplicaDescriptorByID(fromReplica.ReplicaID)
	return !found && fromReplica.ReplicaID < desc.NextReplicaID
}
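
// Worked example with hypothetical IDs: if toReplica's descriptor lists
// replica IDs {2, 3, 4} and has NextReplicaID = 5, a message from replica
// ID 1 is too old: it is absent from the descriptor and its ID is below
// NextReplicaID, so it can only be a removed former member. A message from
// replica ID 6 is not too old, since an ID at or above NextReplicaID may
// belong to a newer descriptor that this replica has not yet caught up to.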

// addReplicaInternalLocked adds the replica to the replicas map and the
// replicasByKey btree. Returns an error if a replica with the same Range ID
// or an overlapping KeyRange has already been added to this store.
// addReplicaInternalLocked requires that the store lock is held.
func (s *Store) addReplicaInternalLocked(repl *Replica) error {
	if !repl.IsInitialized() {
		return errors.Errorf("attempted to add uninitialized replica %s", repl)
	}

	if err := s.addReplicaToRangeMapLocked(repl); err != nil {
		return err
	}

	if exRange := s.getOverlappingKeyRangeLocked(repl.Desc()); exRange != nil {
		return errors.Errorf("%s: cannot addReplicaInternalLocked; range %s has overlapping range %s", s, repl, exRange.Desc())
	}

	if exRngItem := s.mu.replicasByKey.ReplaceOrInsert(repl); exRngItem != nil {
		return errors.Errorf("%s: cannot addReplicaInternalLocked; range for key %v already exists in replicasByKey btree", s,
			exRngItem.(KeyRange).startKey())
	}

	return nil
}

// addPlaceholder adds the specified placeholder. Requires that the raftMu
// of the replica whose place is being held is locked.
func (s *Store) addPlaceholder(placeholder *ReplicaPlaceholder) error {
	s.mu.Lock()
	defer s.mu.Unlock()
	return s.addPlaceholderLocked(placeholder)
}

// addPlaceholderLocked adds the specified placeholder. Requires that Store.mu
// and the raftMu of the replica whose place is being held are locked.
func (s *Store) addPlaceholderLocked(placeholder *ReplicaPlaceholder) error {
	rangeID := placeholder.Desc().RangeID
	if exRng := s.mu.replicasByKey.ReplaceOrInsert(placeholder); exRng != nil {
		return errors.Errorf("%s overlaps with existing KeyRange %s in replicasByKey btree", placeholder, exRng)
	}
	if exRng, ok := s.mu.replicaPlaceholders[rangeID]; ok {
		return errors.Errorf("%s has ID collision with existing KeyRange %s", placeholder, exRng)
	}
	s.mu.replicaPlaceholders[rangeID] = placeholder
	return nil
}

// addReplicaToRangeMapLocked adds the replica to the replicas map.
func (s *Store) addReplicaToRangeMapLocked(repl *Replica) error {
	// It's ok for the replica to exist in the replicas map as long as it is the
	// same replica object. This occurs during splits where the right-hand side
	// is added to the replicas map before it is initialized.
	if existing, loaded := s.mu.replicas.LoadOrStore(
		int64(repl.RangeID), unsafe.Pointer(repl)); loaded && (*Replica)(existing) != repl {
		return errors.Errorf("%s: replica already exists", repl)
	}
	// Check whether the replica is unquiesced but not in the map. This
	// can happen during splits and merges, where the uninitialized (but
	// also unquiesced) replica is removed from the unquiesced replica
	// map in advance of this method being called.
	s.unquiescedReplicas.Lock()
	if _, ok := s.unquiescedReplicas.m[repl.RangeID]; !repl.mu.quiescent && !ok {
		s.unquiescedReplicas.m[repl.RangeID] = struct{}{}
	}
	s.unquiescedReplicas.Unlock()
	return nil
}
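
// Illustrative sketch of the LoadOrStore idiom above, restated with the
// standard library's sync.Map (the store's specialized replica map exposes
// an analogous Load/LoadOrStore shape over unsafe.Pointer values). The
// lookup and the insert happen as one atomic step, so two goroutines racing
// to register the same RangeID can never both install distinct *Replica
// objects:
//
//	var m sync.Map // roachpb.RangeID -> *Replica
//	if existing, loaded := m.LoadOrStore(rangeID, repl); loaded {
//		if existing.(*Replica) != repl {
//			return errors.Errorf("%s: replica already exists", repl)
//		}
//		// Same object already present: benign, e.g. a split's
//		// right-hand side registered before initialization.
//	}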

// maybeMarkReplicaInitializedLocked should be called whenever a previously
// uninitialized replica has become initialized so that the store can update
// its internal bookkeeping. It requires that Store.mu and Replica.raftMu
// are locked.
func (s *Store) maybeMarkReplicaInitializedLocked(ctx context.Context, repl *Replica) error {
	if !repl.IsInitialized() {
		return errors.Errorf("attempted to process uninitialized range %s", repl)
	}

	rangeID := repl.RangeID

	if _, ok := s.mu.uninitReplicas[rangeID]; !ok {
		// Do nothing if the range has already been initialized.
		return nil
	}
	delete(s.mu.uninitReplicas, rangeID)

	if exRange := s.getOverlappingKeyRangeLocked(repl.Desc()); exRange != nil {
		return errors.Errorf("%s: cannot initialize replica; range %s has overlapping range %s",
			s, repl, exRange.Desc())
	}
	if exRngItem := s.mu.replicasByKey.ReplaceOrInsert(repl); exRngItem != nil {
		return errors.Errorf("range for key %v already exists in replicasByKey btree",
			(exRngItem.(*Replica)).startKey())
	}

	// Add the range to metrics and maybe gossip on capacity change.
	s.metrics.ReplicaCount.Inc(1)
	s.maybeGossipOnCapacityChange(ctx, rangeAddEvent)

	return nil
}
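
// Note on the replicasByKey insertions above, assuming the
// github.com/google/btree semantics implied by the error messages (tree and
// item are hypothetical placeholders): ReplaceOrInsert returns the item it
// displaced, or nil if the key span was previously unoccupied, so any
// non-nil return signals an overlap bug:
//
//	if displaced := tree.ReplaceOrInsert(item); displaced != nil {
//		// item silently replaced displaced in the tree; the callers
//		// above surface this as an error rather than continuing.
//	}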