github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/replica_init.go (about) 1 // Copyright 2019 The Cockroach Authors. 2 // 3 // Use of this software is governed by the Business Source License 4 // included in the file licenses/BSL.txt. 5 // 6 // As of the Change Date specified in that file, in accordance with 7 // the Business Source License, use of this software will be governed 8 // by the Apache License, Version 2.0, included in the file 9 // licenses/APL.txt. 10 11 package kvserver 12 13 import ( 14 "context" 15 "math/rand" 16 "time" 17 18 "github.com/cockroachdb/cockroach/pkg/kv/kvserver/abortspan" 19 "github.com/cockroachdb/cockroach/pkg/kv/kvserver/concurrency" 20 "github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverbase" 21 "github.com/cockroachdb/cockroach/pkg/kv/kvserver/split" 22 "github.com/cockroachdb/cockroach/pkg/kv/kvserver/stateloader" 23 "github.com/cockroachdb/cockroach/pkg/roachpb" 24 "github.com/cockroachdb/cockroach/pkg/rpc" 25 "github.com/cockroachdb/cockroach/pkg/util" 26 "github.com/cockroachdb/cockroach/pkg/util/log" 27 "github.com/cockroachdb/cockroach/pkg/util/timeutil" 28 "github.com/cockroachdb/cockroach/pkg/util/uuid" 29 "github.com/cockroachdb/errors" 30 "go.etcd.io/etcd/raft" 31 ) 32 33 const ( 34 splitQueueThrottleDuration = 5 * time.Second 35 mergeQueueThrottleDuration = 5 * time.Second 36 ) 37 38 // newReplica constructs a new Replica. If the desc is initialized, the store 39 // must be present in it and the corresponding replica descriptor must have 40 // replicaID as its ReplicaID. 
41 func newReplica( 42 ctx context.Context, desc *roachpb.RangeDescriptor, store *Store, replicaID roachpb.ReplicaID, 43 ) (*Replica, error) { 44 repl := newUnloadedReplica(ctx, desc, store, replicaID) 45 repl.raftMu.Lock() 46 defer repl.raftMu.Unlock() 47 repl.mu.Lock() 48 defer repl.mu.Unlock() 49 if err := repl.loadRaftMuLockedReplicaMuLocked(desc); err != nil { 50 return nil, err 51 } 52 return repl, nil 53 } 54 55 // newUnloadedReplica partially constructs a replica. The primary reason this 56 // function exists separately from Replica.loadRaftMuLockedReplicaMuLocked() is 57 // to avoid attempting to fully constructing a Replica prior to proving that it 58 // can exist during the delicate synchronization dance that occurs in 59 // Store.tryGetOrCreateReplica(). A Replica returned from this function must not 60 // be used in any way until it's load() method has been called. 61 func newUnloadedReplica( 62 ctx context.Context, desc *roachpb.RangeDescriptor, store *Store, replicaID roachpb.ReplicaID, 63 ) *Replica { 64 if replicaID == 0 { 65 log.Fatalf(context.TODO(), "cannot construct a replica for range %d with a 0 replica ID", desc.RangeID) 66 } 67 r := &Replica{ 68 AmbientContext: store.cfg.AmbientCtx, 69 RangeID: desc.RangeID, 70 store: store, 71 abortSpan: abortspan.New(desc.RangeID), 72 concMgr: concurrency.NewManager(concurrency.Config{ 73 NodeDesc: store.nodeDesc, 74 RangeDesc: desc, 75 Settings: store.ClusterSettings(), 76 DB: store.DB(), 77 Clock: store.Clock(), 78 Stopper: store.Stopper(), 79 IntentResolver: store.intentResolver, 80 TxnWaitMetrics: store.txnWaitMetrics, 81 SlowLatchGauge: store.metrics.SlowLatchRequests, 82 DisableTxnPushing: store.TestingKnobs().DontPushOnWriteIntentError, 83 TxnWaitKnobs: store.TestingKnobs().TxnWaitKnobs, 84 }), 85 } 86 r.mu.pendingLeaseRequest = makePendingLeaseRequest(r) 87 r.mu.stateLoader = stateloader.Make(desc.RangeID) 88 r.mu.quiescent = true 89 r.mu.zone = store.cfg.DefaultZoneConfig 90 r.mu.replicaID = 
replicaID 91 split.Init(&r.loadBasedSplitter, rand.Intn, func() float64 { 92 return float64(SplitByLoadQPSThreshold.Get(&store.cfg.Settings.SV)) 93 }) 94 r.mu.proposals = map[kvserverbase.CmdIDKey]*ProposalData{} 95 r.mu.checksums = map[uuid.UUID]ReplicaChecksum{} 96 r.mu.proposalBuf.Init((*replicaProposer)(r)) 97 98 if leaseHistoryMaxEntries > 0 { 99 r.leaseHistory = newLeaseHistory() 100 } 101 if store.cfg.StorePool != nil { 102 r.leaseholderStats = newReplicaStats(store.Clock(), store.cfg.StorePool.getNodeLocalityString) 103 } 104 // Pass nil for the localityOracle because we intentionally don't track the 105 // origin locality of write load. 106 r.writeStats = newReplicaStats(store.Clock(), nil) 107 108 // Init rangeStr with the range ID. 109 r.rangeStr.store(replicaID, &roachpb.RangeDescriptor{RangeID: desc.RangeID}) 110 // Add replica log tag - the value is rangeStr.String(). 111 r.AmbientContext.AddLogTag("r", &r.rangeStr) 112 // Add replica pointer value. NB: this was historically useful for debugging 113 // replica GC issues, but is a distraction at the moment. 114 // r.AmbientContext.AddLogTag("@", fmt.Sprintf("%x", unsafe.Pointer(r))) 115 r.raftMu.stateLoader = stateloader.Make(desc.RangeID) 116 117 r.splitQueueThrottle = util.Every(splitQueueThrottleDuration) 118 r.mergeQueueThrottle = util.Every(mergeQueueThrottleDuration) 119 return r 120 } 121 122 // loadRaftMuLockedReplicaMuLocked will load the state of the replica from disk. 123 // If desc is initialized, the Replica will be initialized when this method 124 // returns. An initialized Replica may not be reloaded. If this method is called 125 // with an uninitialized desc it may be called again later with an initialized 126 // desc. 
//
// This method is called in three places:
//
//  1) newReplica - used when the store is initializing and during testing
//  2) tryGetOrCreateReplica - see newUnloadedReplica
//  3) splitPostApply - this call initializes a previously uninitialized Replica.
//
// An error is returned when the replica state or last index cannot be loaded,
// or when the sideloaded storage cannot be created. Violated invariants
// (reinitialization, a zero replica ID, a replica ID mismatch, or a store that
// is missing from an initialized descriptor) are fatal rather than returned.
func (r *Replica) loadRaftMuLockedReplicaMuLocked(desc *roachpb.RangeDescriptor) error {
	// No context is passed in, so one is derived from the replica's ambient
	// context for logging.
	ctx := r.AnnotateCtx(context.TODO())
	// The explicit nil check comes first because isInitializedRLocked
	// dereferences r.mu.state.Desc.
	if r.mu.state.Desc != nil && r.isInitializedRLocked() {
		log.Fatalf(ctx, "r%d: cannot reinitialize an initialized replica", desc.RangeID)
	} else if r.mu.replicaID == 0 {
		// NB: This is just a defensive check as r.mu.replicaID should never be 0.
		log.Fatalf(ctx, "r%d: cannot initialize replica without a replicaID", desc.RangeID)
	}

	// Clear the internal raft group in case we're being reset. Since we're
	// reloading the raft state below, it isn't safe to use the existing raft
	// group.
	r.mu.internalRaftGroup = nil

	var err error
	if r.mu.state, err = r.mu.stateLoader.Load(ctx, r.Engine(), desc); err != nil {
		return err
	}
	r.mu.lastIndex, err = r.mu.stateLoader.LoadLastIndex(ctx, r.Engine())
	if err != nil {
		return err
	}
	// The last term is not loaded here; it is reset to the invalid sentinel.
	r.mu.lastTerm = invalidLastTerm

	// Ensure that we're not trying to load a replica with a different ID than
	// was used to construct this Replica.
	replicaID := r.mu.replicaID
	if replicaDesc, found := r.mu.state.Desc.GetReplicaDescriptor(r.StoreID()); found {
		replicaID = replicaDesc.ReplicaID
	} else if desc.IsInitialized() {
		// An uninitialized descriptor may legitimately omit this store; an
		// initialized one may not.
		log.Fatalf(ctx, "r%d: cannot initialize replica which is not in descriptor %v", desc.RangeID, desc)
	}
	if r.mu.replicaID != replicaID {
		log.Fatalf(ctx, "attempting to initialize a replica which has ID %d with ID %d",
			r.mu.replicaID, replicaID)
	}

	r.setDescLockedRaftMuLocked(ctx, desc)

	// Init the minLeaseProposedTS such that we won't use an existing lease (if
	// any). This is so that, after a restart, we don't propose under old leases.
	// If the replica is being created through a split, this value will be
	// overridden.
	if !r.store.cfg.TestingKnobs.DontPreventUseOfOldLeaseOnStart {
		// Only do this if there was a previous lease. This shouldn't be important
		// to do but consider that the first lease which is obtained is back-dated
		// to a zero start timestamp (and this de-flakes some tests). If we set the
		// min proposed TS here, this lease could not be renewed (by the semantics
		// of minLeaseProposedTS); and since minLeaseProposedTS is copied on splits,
		// this problem would multiply to a number of replicas at cluster bootstrap.
		// Instead, we make the first lease special (which is OK) and the problem
		// disappears.
		if r.mu.state.Lease.Sequence > 0 {
			r.mu.minLeaseProposedTS = r.Clock().Now()
		}
	}

	// The sideloaded storage is rooted in the engine's auxiliary directory and
	// is keyed by range ID and the validated replica ID.
	ssBase := r.Engine().GetAuxiliaryDir()
	if r.raftMu.sideloaded, err = newDiskSideloadStorage(
		r.store.cfg.Settings,
		desc.RangeID,
		replicaID,
		ssBase,
		r.store.limiters.BulkIOWriteRate,
		r.store.engine,
	); err != nil {
		return errors.Wrap(err, "while initializing sideloaded storage")
	}
	// Check the freshly loaded in-memory state against the store's engine.
	r.assertStateLocked(ctx, r.store.Engine())
	return nil
}

// IsInitialized is true if we know the metadata of this range, either
// because we created it or we have received an initial snapshot from
// another node. It is false when a range has been created in response
// to an incoming message but we are waiting for our initial snapshot.
//
// It acquires the replica's read lock for the duration of the check.
func (r *Replica) IsInitialized() bool {
	r.mu.RLock()
	defer r.mu.RUnlock()
	return r.isInitializedRLocked()
}

// isInitializedRLocked is true if we know the metadata of this range, either
// because we created it or we have received an initial snapshot from
// another node.
It is false when a range has been created in response 219 // to an incoming message but we are waiting for our initial snapshot. 220 // isInitializedLocked requires that the replica lock is held. 221 func (r *Replica) isInitializedRLocked() bool { 222 return r.mu.state.Desc.IsInitialized() 223 } 224 225 // maybeInitializeRaftGroup check whether the internal Raft group has 226 // not yet been initialized. If not, it is created and set to campaign 227 // if this replica is the most recent owner of the range lease. 228 func (r *Replica) maybeInitializeRaftGroup(ctx context.Context) { 229 r.mu.RLock() 230 // If this replica hasn't initialized the Raft group, create it and 231 // unquiesce and wake the leader to ensure the replica comes up to date. 232 initialized := r.mu.internalRaftGroup != nil 233 // If this replica has been removed or is in the process of being removed 234 // then it'll never handle any raft events so there's no reason to initialize 235 // it now. 236 removed := !r.mu.destroyStatus.IsAlive() 237 r.mu.RUnlock() 238 if initialized || removed { 239 return 240 } 241 242 // Acquire raftMu, but need to maintain lock ordering (raftMu then mu). 243 r.raftMu.Lock() 244 defer r.raftMu.Unlock() 245 r.mu.Lock() 246 defer r.mu.Unlock() 247 248 // If we raced on checking the destroyStatus above that's fine as 249 // the below withRaftGroupLocked will no-op. 250 if err := r.withRaftGroupLocked(true, func(raftGroup *raft.RawNode) (bool, error) { 251 return true, nil 252 }); err != nil && !errors.Is(err, errRemoved) { 253 log.VErrEventf(ctx, 1, "unable to initialize raft group: %s", err) 254 } 255 } 256 257 // setDescRaftMuLocked atomically sets the replica's descriptor. It requires raftMu to be 258 // locked. 
259 func (r *Replica) setDescRaftMuLocked(ctx context.Context, desc *roachpb.RangeDescriptor) { 260 r.mu.Lock() 261 defer r.mu.Unlock() 262 r.setDescLockedRaftMuLocked(ctx, desc) 263 } 264 265 func (r *Replica) setDescLockedRaftMuLocked(ctx context.Context, desc *roachpb.RangeDescriptor) { 266 if desc.RangeID != r.RangeID { 267 log.Fatalf(ctx, "range descriptor ID (%d) does not match replica's range ID (%d)", 268 desc.RangeID, r.RangeID) 269 } 270 if r.mu.state.Desc.IsInitialized() && 271 (desc == nil || !desc.IsInitialized()) { 272 log.Fatalf(ctx, "cannot replace initialized descriptor with uninitialized one: %+v -> %+v", 273 r.mu.state.Desc, desc) 274 } 275 if r.mu.state.Desc.IsInitialized() && 276 !r.mu.state.Desc.StartKey.Equal(desc.StartKey) { 277 log.Fatalf(ctx, "attempted to change replica's start key from %s to %s", 278 r.mu.state.Desc.StartKey, desc.StartKey) 279 } 280 281 // NB: It might be nice to assert that the current replica exists in desc 282 // however we allow it to not be present for three reasons: 283 // 284 // 1) When removing the current replica we update the descriptor to the point 285 // of removal even though we will delete the Replica's data in the same 286 // batch. We could avoid setting the local descriptor in this case. 287 // 2) When the DisableEagerReplicaRemoval testing knob is enabled. We 288 // could remove all tests which utilize this behavior now that there's 289 // no other mechanism for range state which does not contain the current 290 // store to exist on disk. 291 // 3) Various unit tests do not provide a valid descriptor. 292 replDesc, found := desc.GetReplicaDescriptor(r.StoreID()) 293 if found && replDesc.ReplicaID != r.mu.replicaID { 294 log.Fatalf(ctx, "attempted to change replica's ID from %d to %d", 295 r.mu.replicaID, replDesc.ReplicaID) 296 } 297 298 // Determine if a new replica was added. This is true if the new max replica 299 // ID is greater than the old max replica ID. 
300 oldMaxID := maxReplicaIDOfAny(r.mu.state.Desc) 301 newMaxID := maxReplicaIDOfAny(desc) 302 if newMaxID > oldMaxID { 303 r.mu.lastReplicaAdded = newMaxID 304 r.mu.lastReplicaAddedTime = timeutil.Now() 305 } else if r.mu.lastReplicaAdded > newMaxID { 306 // The last replica added was removed. 307 r.mu.lastReplicaAdded = 0 308 r.mu.lastReplicaAddedTime = time.Time{} 309 } 310 311 r.rangeStr.store(r.mu.replicaID, desc) 312 r.connectionClass.set(rpc.ConnectionClassForKey(desc.StartKey)) 313 r.concMgr.OnRangeDescUpdated(desc) 314 r.mu.state.Desc = desc 315 }