vitess.io/vitess@v0.16.2/go/vt/vtctl/reparentutil/emergency_reparenter.go

/*
Copyright 2021 The Vitess Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

	http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package reparentutil

import (
	"context"
	"fmt"
	"sync"
	"time"

	"google.golang.org/protobuf/proto"

	"k8s.io/apimachinery/pkg/util/sets"

	"vitess.io/vitess/go/event"
	"vitess.io/vitess/go/mysql"
	"vitess.io/vitess/go/stats"
	"vitess.io/vitess/go/vt/concurrency"
	"vitess.io/vitess/go/vt/logutil"
	"vitess.io/vitess/go/vt/topo"
	"vitess.io/vitess/go/vt/topo/topoproto"
	"vitess.io/vitess/go/vt/topotools/events"
	"vitess.io/vitess/go/vt/vtctl/reparentutil/promotionrule"
	"vitess.io/vitess/go/vt/vterrors"
	"vitess.io/vitess/go/vt/vttablet/tmclient"

	logutilpb "vitess.io/vitess/go/vt/proto/logutil"
	replicationdatapb "vitess.io/vitess/go/vt/proto/replicationdata"
	topodatapb "vitess.io/vitess/go/vt/proto/topodata"
	"vitess.io/vitess/go/vt/proto/vtrpc"
)

// EmergencyReparenter performs EmergencyReparentShard operations.
type EmergencyReparenter struct {
	ts     *topo.Server
	tmc    tmclient.TabletManagerClient
	logger logutil.Logger
}

// EmergencyReparentOptions provides optional parameters to
// EmergencyReparentShard operations. Options are passed by value, so it is safe
// for callers to mutate and reuse options structs for multiple calls.
type EmergencyReparentOptions struct {
	NewPrimaryAlias           *topodatapb.TabletAlias
	IgnoreReplicas            sets.Set[string]
	WaitReplicasTimeout       time.Duration
	PreventCrossCellPromotion bool

	// Private options managed internally. We use value passing to avoid leaking
	// these details back out.
	lockAction string
	durability Durabler
}

// counters for Emergency Reparent Shard
var (
	ersCounter        = stats.NewGauge("ers_counter", "Number of times Emergency Reparent Shard has been run")
	ersSuccessCounter = stats.NewGauge("ers_success_counter", "Number of times Emergency Reparent Shard has succeeded")
	ersFailureCounter = stats.NewGauge("ers_failure_counter", "Number of times Emergency Reparent Shard has failed")
)

// NewEmergencyReparenter returns a new EmergencyReparenter object, ready to
// perform EmergencyReparentShard operations using the given topo.Server,
// TabletManagerClient, and logger.
//
// Providing a nil logger instance is allowed.
func NewEmergencyReparenter(ts *topo.Server, tmc tmclient.TabletManagerClient, logger logutil.Logger) *EmergencyReparenter {
	erp := EmergencyReparenter{
		ts:     ts,
		tmc:    tmc,
		logger: logger,
	}

	if erp.logger == nil {
		// Create a no-op logger so we can call functions on erp.logger without
		// needing to constantly check for non-nil.
		erp.logger = logutil.NewCallbackLogger(func(*logutilpb.Event) {})
	}

	return &erp
}

// ReparentShard performs the EmergencyReparentShard operation on the given
// keyspace and shard.
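//
// A rough usage sketch (illustrative only; the keyspace/shard names are made
// up, the ts, tmc, logger, and ctx values are assumed to be set up by the
// caller, and error handling is elided):
//
//	erp := NewEmergencyReparenter(ts, tmc, logger)
//	ev, err := erp.ReparentShard(ctx, "commerce", "-80", EmergencyReparentOptions{
//		WaitReplicasTimeout: 30 * time.Second,
//	})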
func (erp *EmergencyReparenter) ReparentShard(ctx context.Context, keyspace string, shard string, opts EmergencyReparentOptions) (*events.Reparent, error) {
	var err error
	// First step is to lock the shard for the given operation, if not already locked
	if err = topo.CheckShardLocked(ctx, keyspace, shard); err != nil {
		var unlock func(*error)
		opts.lockAction = erp.getLockAction(opts.NewPrimaryAlias)
		ctx, unlock, err = erp.ts.LockShard(ctx, keyspace, shard, opts.lockAction)
		if err != nil {
			return nil, err
		}
		defer unlock(&err)
	}

	// dispatch success or failure of ERS
	ev := &events.Reparent{}
	defer func() {
		switch err {
		case nil:
			ersSuccessCounter.Add(1)
			event.DispatchUpdate(ev, "finished EmergencyReparentShard")
		default:
			ersFailureCounter.Add(1)
			event.DispatchUpdate(ev, "failed EmergencyReparentShard: "+err.Error())
		}
	}()

	err = erp.reparentShardLocked(ctx, ev, keyspace, shard, opts)

	return ev, err
}

func (erp *EmergencyReparenter) getLockAction(newPrimaryAlias *topodatapb.TabletAlias) string {
	action := "EmergencyReparentShard"

	if newPrimaryAlias != nil {
		action += fmt.Sprintf("(%v)", topoproto.TabletAliasString(newPrimaryAlias))
	}

	return action
}

// reparentShardLocked performs the Emergency Reparent Shard operation assuming that the shard is already locked
func (erp *EmergencyReparenter) reparentShardLocked(ctx context.Context, ev *events.Reparent, keyspace, shard string, opts EmergencyReparentOptions) (err error) {
	// log the starting of the operation and increment the counter
	erp.logger.Infof("will initiate emergency reparent shard in keyspace - %s, shard - %s", keyspace, shard)
	ersCounter.Add(1)

	var (
		stoppedReplicationSnapshot *replicationSnapshot
		shardInfo                  *topo.ShardInfo
		prevPrimary                *topodatapb.Tablet
		tabletMap                  map[string]*topo.TabletInfo
		validCandidates            map[string]mysql.Position
		intermediateSource         *topodatapb.Tablet
		validCandidateTablets      []*topodatapb.Tablet
		validReplacementCandidates []*topodatapb.Tablet
		betterCandidate            *topodatapb.Tablet
		isIdeal                    bool
	)

	shardInfo, err = erp.ts.GetShard(ctx, keyspace, shard)
	if err != nil {
		return err
	}
	ev.ShardInfo = *shardInfo

	keyspaceDurability, err := erp.ts.GetKeyspaceDurability(ctx, keyspace)
	if err != nil {
		return err
	}

	erp.logger.Infof("Getting a new durability policy for %v", keyspaceDurability)
	opts.durability, err = GetDurabilityPolicy(keyspaceDurability)
	if err != nil {
		return err
	}

	// get the previous primary according to the topology server,
	// we use this information to choose the best candidate in the same cell
	// and to undo promotion in case of failure
	if shardInfo.PrimaryAlias != nil {
		prevPrimaryInfo, err := erp.ts.GetTablet(ctx, shardInfo.PrimaryAlias)
		if err != nil {
			return err
		}
		prevPrimary = prevPrimaryInfo.Tablet
	}

	// read all the tablets and their information
	event.DispatchUpdate(ev, "reading all tablets")
	tabletMap, err = erp.ts.GetTabletMapForShard(ctx, keyspace, shard)
	if err != nil {
		return vterrors.Wrapf(err, "failed to get tablet map for %v/%v: %v", keyspace, shard, err)
	}

	// Stop replication on all the tablets and build their status map
	stoppedReplicationSnapshot, err = stopReplicationAndBuildStatusMaps(ctx, erp.tmc, ev, tabletMap, topo.RemoteOperationTimeout, opts.IgnoreReplicas, opts.NewPrimaryAlias, opts.durability, erp.logger)
	if err != nil {
		return vterrors.Wrapf(err, "failed to stop replication and build status maps: %v", err)
	}

	// check that we still have the shard lock. If we don't then we can terminate at this point
	if err := topo.CheckShardLocked(ctx, keyspace, shard); err != nil {
		return vterrors.Wrapf(err, "lost topology lock, aborting: %v", err)
	}

	// find the valid candidates for becoming the primary
	// this is where we check for errant GTIDs and remove the tablets that have them from consideration
	validCandidates, err = FindValidEmergencyReparentCandidates(stoppedReplicationSnapshot.statusMap, stoppedReplicationSnapshot.primaryStatusMap)
	if err != nil {
		return err
	}
	// Restrict the valid candidates list. We remove any tablet which is of the type DRAINED, RESTORE or BACKUP.
	validCandidates, err = restrictValidCandidates(validCandidates, tabletMap)
	if err != nil {
		return err
	} else if len(validCandidates) == 0 {
		return vterrors.Errorf(vtrpc.Code_FAILED_PRECONDITION, "no valid candidates for emergency reparent")
	}

	// Wait for all candidates to apply relay logs
	if err = erp.waitForAllRelayLogsToApply(ctx, validCandidates, tabletMap, stoppedReplicationSnapshot.statusMap, opts.WaitReplicasTimeout); err != nil {
		return err
	}

	// Find the intermediate source for replication that we want other tablets to replicate from.
	// This step chooses the most advanced tablet. Further ties are broken by using the promotion rule.
	// In case the user has specified a tablet specifically, then it is selected, as long as it is the most advanced.
	// Here we also check for split brain scenarios and verify that the selected replica is at least as advanced as all the other valid candidates.
	// We fail in case there is a split brain detected.
	// The validCandidateTablets list is sorted by the replication positions with ties broken by promotion rules.
	intermediateSource, validCandidateTablets, err = erp.findMostAdvanced(validCandidates, tabletMap, opts)
	if err != nil {
		return err
	}
	erp.logger.Infof("intermediate source selected - %v", intermediateSource.Alias)

	// After finding the intermediate source, we want to filter the valid candidate list by the following criteria -
	// 1. Only keep the tablets which can make progress after being promoted (have sufficient reachable semi-sync ackers)
	// 2. Remove the tablets with the Must_not promote rule
	// 3. Remove cross-cell tablets if PreventCrossCellPromotion is specified
	// Our final primary candidate MUST belong to this list of valid candidates
	validCandidateTablets, err = erp.filterValidCandidates(validCandidateTablets, stoppedReplicationSnapshot.reachableTablets, prevPrimary, opts)
	if err != nil {
		return err
	}

	// Check whether the intermediate source candidate selected is ideal or if it can be improved later.
	// If the intermediateSource is ideal, then we can be certain that it is part of the valid candidates list.
	isIdeal, err = erp.isIntermediateSourceIdeal(intermediateSource, validCandidateTablets, tabletMap, opts)
	if err != nil {
		return err
	}
	erp.logger.Infof("intermediate source is ideal candidate - %v", isIdeal)

	// Check (again) we still have the topology lock.
	if err = topo.CheckShardLocked(ctx, keyspace, shard); err != nil {
		return vterrors.Wrapf(err, "lost topology lock, aborting: %v", err)
	}

	// initialize the newPrimary with the intermediate source, override this value if it is not the ideal candidate
	newPrimary := intermediateSource
	if !isIdeal {
		// we now reparent all the tablets to start replicating from the intermediate source
		// we do not promote the tablet or change the shard record. We only change the replication for all the other tablets
		// it also returns the list of tablets (including the intermediate source itself) that started replicating successfully and are part of the validCandidateTablets list.
		// These are the candidates that we can use to find a replacement.
		validReplacementCandidates, err = erp.promoteIntermediateSource(ctx, ev, intermediateSource, tabletMap, stoppedReplicationSnapshot.statusMap, validCandidateTablets, opts)
		if err != nil {
			return err
		}

		// try to find a better candidate using the list we got back
		// We prefer to choose a candidate which is in the same cell as our previous primary and of the best possible durability rule.
		// However, if there is an explicit request from the user to promote a specific tablet, then we choose that tablet.
		betterCandidate, err = erp.identifyPrimaryCandidate(intermediateSource, validReplacementCandidates, tabletMap, opts)
		if err != nil {
			return err
		}

		// if our better candidate is different from our intermediate source, then we wait for it to catch up to the intermediate source
		if !topoproto.TabletAliasEqual(betterCandidate.Alias, intermediateSource.Alias) {
			err = waitForCatchUp(ctx, erp.tmc, erp.logger, betterCandidate, intermediateSource, opts.WaitReplicasTimeout)
			if err != nil {
				return err
			}
			newPrimary = betterCandidate
		}
	}

	// The new primary which will be promoted will always belong to the validCandidateTablets list because -
	// 1. if the intermediate source is ideal - then we know the intermediate source was in the validCandidateTablets list
	//    since we used that list
	// 2. if the intermediate source isn't ideal - we take the intersection of the validCandidateTablets list and the one we
	//    were able to reach during the promotion of intermediate source, as possible candidates.
	//    So the final candidate (even if it is the intermediate source itself) will belong to the list.
	// Since the new primary tablet belongs to the validCandidateTablets list, we no longer need any additional constraint checks

	// Final step is to promote our primary candidate
	err = erp.promoteNewPrimary(ctx, ev, newPrimary, opts, tabletMap, stoppedReplicationSnapshot.statusMap)
	if err != nil {
		return err
	}

	ev.NewPrimary = proto.Clone(newPrimary).(*topodatapb.Tablet)
	return err
}

func (erp *EmergencyReparenter) waitForAllRelayLogsToApply(
	ctx context.Context,
	validCandidates map[string]mysql.Position,
	tabletMap map[string]*topo.TabletInfo,
	statusMap map[string]*replicationdatapb.StopReplicationStatus,
	waitReplicasTimeout time.Duration,
) error {
	errCh := make(chan concurrency.Error)
	defer close(errCh)

	groupCtx, groupCancel := context.WithTimeout(ctx, waitReplicasTimeout)
	defer groupCancel()

	waiterCount := 0

	for candidate := range validCandidates {
		// When we called stopReplicationAndBuildStatusMaps, we got back two
		// maps: (1) the StopReplicationStatus of any replicas that actually
		// stopped replication; and (2) the PrimaryStatus of anything that
		// returned ErrNotReplica, which is a tablet that is either the current
		// primary or is stuck thinking it is a PRIMARY but is not in actuality.
		//
		// If we have a tablet in the validCandidates map that does not appear
		// in the statusMap, then we have either (a) the current primary, which
		// is not replicating, so it is not applying relay logs; or (b) a tablet
		// that is stuck thinking it is PRIMARY but is not in actuality. In that
		// second case - (b) - we will most likely find that the stuck PRIMARY
		// does not have a winning position, and fail the ERS. If, on the other
		// hand, it does have a winning position, we are trusting the operator
		// to know what they are doing by emergency-reparenting onto that
		// tablet. In either case, it does not make sense to wait for relay logs
		// to apply on a tablet that was never applying relay logs in the first
		// place, so we skip it, and log that we did.
		status, ok := statusMap[candidate]
		if !ok {
			erp.logger.Infof("EmergencyReparent candidate %v not in replica status map; this means it was not running replication (because it was formerly PRIMARY), so skipping WaitForRelayLogsToApply step for this candidate", candidate)
			continue
		}

		go func(alias string, status *replicationdatapb.StopReplicationStatus) {
			var err error
			defer func() {
				errCh <- concurrency.Error{
					Err: err,
				}
			}()
			err = WaitForRelayLogsToApply(groupCtx, erp.tmc, tabletMap[alias], status)
		}(candidate, status)

		waiterCount++
	}

	errgroup := concurrency.ErrorGroup{
		NumGoroutines:        waiterCount,
		NumRequiredSuccesses: waiterCount,
		NumAllowedErrors:     0,
	}
	rec := errgroup.Wait(groupCancel, errCh)

	if len(rec.Errors) != 0 {
		return vterrors.Wrapf(rec.Error(), "could not apply all relay logs within the provided waitReplicasTimeout (%s): %v", waitReplicasTimeout, rec.Error())
	}

	return nil
}

// findMostAdvanced finds the intermediate source for ERS. We always choose the most advanced one from our valid candidates list. Further ties are broken by looking at the promotion rules.
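//
// As a hypothetical illustration (the aliases, positions, and promotion rules
// below are made up), given the valid candidates
//
//	zone1-0000000101 at position 1-100 (promotion rule: neutral)
//	zone1-0000000102 at position 1-90  (promotion rule: prefer)
//	zone2-0000000103 at position 1-100 (promotion rule: prefer)
//
// zone2-0000000103 would be picked as the intermediate source: it is tied for
// the most advanced position, and the tie is broken by its better promotion rule.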
func (erp *EmergencyReparenter) findMostAdvanced(
	validCandidates map[string]mysql.Position,
	tabletMap map[string]*topo.TabletInfo,
	opts EmergencyReparentOptions,
) (*topodatapb.Tablet, []*topodatapb.Tablet, error) {
	erp.logger.Infof("started finding the intermediate source")
	// convert the valid candidates into a list so that we can use it for sorting
	validTablets, tabletPositions, err := getValidCandidatesAndPositionsAsList(validCandidates, tabletMap)
	if err != nil {
		return nil, nil, err
	}

	// sort the tablets for finding the best intermediate source in ERS
	err = sortTabletsForReparent(validTablets, tabletPositions, opts.durability)
	if err != nil {
		return nil, nil, err
	}
	for _, tablet := range validTablets {
		erp.logger.Infof("finding intermediate source - sorted replica: %v", tablet.Alias)
	}

	// The first tablet in the sorted list will be the most eligible candidate unless explicitly asked for some other tablet
	winningPrimaryTablet := validTablets[0]
	winningPosition := tabletPositions[0]

	// We have already removed the tablets with errant GTIDs before calling this function. At this point our winning position must be a
	// superset of all the other valid positions. If that is not the case, then we have a split brain scenario, and we should cancel the ERS
	for i, position := range tabletPositions {
		if !winningPosition.AtLeast(position) {
			return nil, nil, vterrors.Errorf(vtrpc.Code_FAILED_PRECONDITION, "split brain detected between servers - %v and %v", winningPrimaryTablet.Alias, validTablets[i].Alias)
		}
	}

	// If we were requested to elect a particular primary, verify it's a valid
	// candidate (non-zero position, no errant GTIDs)
	if opts.NewPrimaryAlias != nil {
		requestedPrimaryAlias := topoproto.TabletAliasString(opts.NewPrimaryAlias)
		pos, ok := validCandidates[requestedPrimaryAlias]
		if !ok {
			return nil, nil, vterrors.Errorf(vtrpc.Code_FAILED_PRECONDITION, "requested primary elect %v has errant GTIDs", requestedPrimaryAlias)
		}
		// if the requested tablet is as advanced as the most advanced tablet, then we can just use it for promotion.
		// otherwise, we should let it catch up to the most advanced tablet and not change the intermediate source
		if pos.AtLeast(winningPosition) {
			requestedPrimaryInfo, isFound := tabletMap[requestedPrimaryAlias]
			if !isFound {
				return nil, nil, vterrors.Errorf(vtrpc.Code_INTERNAL, "candidate %v not found in the tablet map; this is an impossible situation", requestedPrimaryAlias)
			}
			winningPrimaryTablet = requestedPrimaryInfo.Tablet
		}
	}

	return winningPrimaryTablet, validTablets, nil
}

// promoteIntermediateSource reparents all the other tablets to start replicating from the intermediate source.
// It does not promote this tablet to a primary instance; we only let other replicas start replicating from this tablet.
func (erp *EmergencyReparenter) promoteIntermediateSource(
	ctx context.Context,
	ev *events.Reparent,
	source *topodatapb.Tablet,
	tabletMap map[string]*topo.TabletInfo,
	statusMap map[string]*replicationdatapb.StopReplicationStatus,
	validCandidateTablets []*topodatapb.Tablet,
	opts EmergencyReparentOptions,
) ([]*topodatapb.Tablet, error) {
	// we reparent all the other tablets to start replication from our new source
	// we wait for all the replicas so that we can choose a better candidate from the ones that started replication later
	reachableTablets, err := erp.reparentReplicas(ctx, ev, source, tabletMap, statusMap, opts, true /* waitForAllReplicas */, false /* populateReparentJournal */)
	if err != nil {
		return nil, err
	}

	// also include the current tablet for being considered as part of valid candidates for ERS promotion
	reachableTablets = append(reachableTablets, source)

	// The only valid candidates for improvement are the ones which are reachable and part of the valid candidate list.
	// Here we need to be careful not to mess up the ordering of tablets in validCandidateTablets, since the list is sorted by the
	// replication positions.
	var validCandidatesForImprovement []*topodatapb.Tablet
	for _, tablet := range validCandidateTablets {
		if topoproto.IsTabletInList(tablet, reachableTablets) {
			validCandidatesForImprovement = append(validCandidatesForImprovement, tablet)
		}
	}
	return validCandidatesForImprovement, nil
}

// reparentReplicas reparents all the replicas provided and populates the reparent journal on the primary if asked.
// Also, it returns the replicas which started replicating only in the case where we wait for all the replicas
func (erp *EmergencyReparenter) reparentReplicas(
	ctx context.Context,
	ev *events.Reparent,
	newPrimaryTablet *topodatapb.Tablet,
	tabletMap map[string]*topo.TabletInfo,
	statusMap map[string]*replicationdatapb.StopReplicationStatus,
	opts EmergencyReparentOptions,
	waitForAllReplicas bool,
	populateReparentJournal bool,
) ([]*topodatapb.Tablet, error) {

	var (
		replicasStartedReplication []*topodatapb.Tablet
		replicaMutex               sync.Mutex
	)

	replCtx, replCancel := context.WithTimeout(context.Background(), opts.WaitReplicasTimeout)

	event.DispatchUpdate(ev, "reparenting all tablets")

	// Create a context and cancel function to watch for the first successful
	// SetReplicationSource call on a replica. We use a background context so that this
	// context is only ever Done when its cancel is called by the background
	// goroutine we're about to spin up.
	//
	// Similarly, create a context and cancel for the replica waiter goroutine
	// to signal when all replica goroutines have finished. In the case where at
	// least one replica succeeds, replSuccessCtx will be canceled first, while
	// allReplicasDoneCtx is guaranteed to be canceled within
	// opts.WaitReplicasTimeout plus some jitter.
	replSuccessCtx, replSuccessCancel := context.WithCancel(context.Background())
	allReplicasDoneCtx, allReplicasDoneCancel := context.WithCancel(context.Background())

	now := time.Now().UnixNano()
	replWg := sync.WaitGroup{}
	rec := concurrency.AllErrorRecorder{}

	handlePrimary := func(alias string, tablet *topodatapb.Tablet) error {
		position, err := erp.tmc.PrimaryPosition(replCtx, tablet)
		if err != nil {
			return err
		}
		if populateReparentJournal {
			erp.logger.Infof("populating reparent journal on new primary %v", alias)
			return erp.tmc.PopulateReparentJournal(replCtx, tablet, now, opts.lockAction, newPrimaryTablet.Alias, position)
		}
		return nil
	}

	handleReplica := func(alias string, ti *topo.TabletInfo) {
		defer replWg.Done()
		erp.logger.Infof("setting new primary on replica %v", alias)

		forceStart := false
		if status, ok := statusMap[alias]; ok {
			fs, err := ReplicaWasRunning(status)
			if err != nil {
				err = vterrors.Wrapf(err, "tablet %v could not determine StopReplicationStatus: %v", alias, err)
				rec.RecordError(err)

				return
			}

			forceStart = fs
		}

		err := erp.tmc.SetReplicationSource(replCtx, ti.Tablet, newPrimaryTablet.Alias, 0, "", forceStart, IsReplicaSemiSync(opts.durability, newPrimaryTablet, ti.Tablet))
		if err != nil {
			err = vterrors.Wrapf(err, "tablet %v SetReplicationSource failed: %v", alias, err)
			rec.RecordError(err)

			return
		}

		replicaMutex.Lock()
		replicasStartedReplication = append(replicasStartedReplication, ti.Tablet)
		replicaMutex.Unlock()

		// Signal that at least one goroutine succeeded to SetReplicationSource.
		// We do this only when we do not want to wait for all the replicas
		if !waitForAllReplicas {
			replSuccessCancel()
		}
	}

	numReplicas := 0

	for alias, ti := range tabletMap {
		switch {
		case alias == topoproto.TabletAliasString(newPrimaryTablet.Alias):
			continue
		case !opts.IgnoreReplicas.Has(alias):
			replWg.Add(1)
			numReplicas++
			go handleReplica(alias, ti)
		}
	}

	// Spin up a background goroutine to wait until all replica goroutines have
	// finished. Polling this way allows us to have reparentReplicas return
	// success as soon as (a) the primary successfully populates its reparent
	// journal and (b) at least one replica successfully begins replicating.
	//
	// If we were to follow the more common pattern of blocking on replWg.Wait()
	// in the main body of promoteNewPrimary, we would be bound to the
	// time of the slowest replica, instead of the time of the fastest successful
	// replica, and we want ERS to be fast.
	go func() {
		replWg.Wait()
		allReplicasDoneCancel()
	}()

	primaryErr := handlePrimary(topoproto.TabletAliasString(newPrimaryTablet.Alias), newPrimaryTablet)
	if primaryErr != nil {
		erp.logger.Warningf("primary failed to PopulateReparentJournal")
		replCancel()

		return nil, vterrors.Wrapf(primaryErr, "failed to PopulateReparentJournal on primary: %v", primaryErr)
	}

	// We should only cancel the context that all the replicas are using when they are done.
	// Since this function can return early when only 1 replica succeeds, if we cancel this context as a deferred call from this function,
	// then we would end up having cancelled the context for the replicas who have not yet finished running all the commands.
	// This leads to some replicas not starting replication properly. So we must wait for all the replicas to finish before cancelling this context.
	go func() {
		replWg.Wait()
		defer replCancel()
	}()

	select {
	case <-replSuccessCtx.Done():
		// At least one replica was able to SetReplicationSource successfully
		// Here we do not need to return the replicas which started replicating
		return nil, nil
	case <-allReplicasDoneCtx.Done():
		// There are certain timing issues between replSuccessCtx.Done firing
		// and allReplicasDoneCtx.Done firing, so we check again if truly all
		// replicas failed (where `numReplicas` goroutines recorded an error) or
		// one or more actually managed to succeed.
		errCount := len(rec.Errors)

		switch {
		case errCount > numReplicas:
			// Technically, rec.Errors should never be greater than numReplicas,
			// but it's better to err on the side of caution here, and also to
			// be explicit that this is doubly unexpected.
			return nil, vterrors.Wrapf(rec.Error(), "received more errors (= %d) than replicas (= %d), which should be impossible: %v", errCount, numReplicas, rec.Error())
		case errCount == numReplicas:
			if len(tabletMap) <= 2 {
				// If there are at most 2 tablets in the tablet map, we shouldn't be failing the promotion if the replica fails to SetReplicationSource.
				// The failing replica is probably the old primary that is down, so it is okay if it fails. We still log a warning message in the logs.
				erp.logger.Warningf("Failed to set the MySQL replication source during ERS but because there is only one other tablet we assume it is the one that had failed and will progress with the reparent. Error: %v", rec.Error())
				return nil, nil
			}
			return nil, vterrors.Wrapf(rec.Error(), "%d replica(s) failed: %v", numReplicas, rec.Error())
		default:
			return replicasStartedReplication, nil
		}
	}

}

// isIntermediateSourceIdeal is used to find whether the intermediate source that ERS chose is also the ideal one or not
func (erp *EmergencyReparenter) isIntermediateSourceIdeal(
	intermediateSource *topodatapb.Tablet,
	validCandidates []*topodatapb.Tablet,
	tabletMap map[string]*topo.TabletInfo,
	opts EmergencyReparentOptions,
) (bool, error) {
	// we try to find a better candidate with the current list of valid candidates, and if it matches our current primary candidate, then we return true
	candidate, err := erp.identifyPrimaryCandidate(intermediateSource, validCandidates, tabletMap, opts)
	if err != nil {
		return false, err
	}
	return candidate == intermediateSource, nil
}

// identifyPrimaryCandidate is used to find the final candidate for ERS promotion
func (erp *EmergencyReparenter) identifyPrimaryCandidate(
	intermediateSource *topodatapb.Tablet,
	validCandidates []*topodatapb.Tablet,
	tabletMap map[string]*topo.TabletInfo,
	opts EmergencyReparentOptions,
) (candidate *topodatapb.Tablet, err error) {
	defer func() {
		if candidate != nil {
			erp.logger.Infof("found better candidate - %v", candidate.Alias)
		}
	}()

	if len(validCandidates) == 0 {
		return nil, vterrors.Errorf(vtrpc.Code_FAILED_PRECONDITION, "no valid candidates for emergency reparent")
	}

	if opts.NewPrimaryAlias != nil {
		// explicit request to promote a specific tablet
		requestedPrimaryAlias := topoproto.TabletAliasString(opts.NewPrimaryAlias)
		requestedPrimaryInfo, isFound := tabletMap[requestedPrimaryAlias]
		if !isFound {
			return nil, vterrors.Errorf(vtrpc.Code_INTERNAL, "candidate %v not found in the tablet map; this is an impossible situation", requestedPrimaryAlias)
		}
		if topoproto.IsTabletInList(requestedPrimaryInfo.Tablet, validCandidates) {
			return requestedPrimaryInfo.Tablet, nil
		}
		return nil, vterrors.Errorf(vtrpc.Code_ABORTED, "requested candidate %v is not in valid candidates list", requestedPrimaryAlias)
	}

	// We have already selected an intermediate source which was selected based on the replication position
	// (ties broken by promotion rules), but that tablet might not even be a valid candidate, i.e. it could
	// be in a different cell when we have PreventCrossCellPromotion specified, or it could have a promotion rule of
	// MustNot. Even if it is valid, there could be a tablet with a better promotion rule. This is what we try to
	// find here.
	// We go over all the promotion rules in descending order of priority and try and find a valid candidate with
	// that promotion rule.
	// If the intermediate source has the same promotion rules as some other tablets, then we prioritize using
	// the intermediate source since we won't have to wait for the new candidate to catch up!
	for _, promotionRule := range promotionrule.AllPromotionRules() {
		candidates := getTabletsWithPromotionRules(opts.durability, validCandidates, promotionRule)
		candidate = findCandidate(intermediateSource, candidates)
		if candidate != nil {
			return candidate, nil
		}
	}
	// Unreachable code.
	// We should have found at least 1 tablet in the valid list.
	// If the list is empty, then we should have errored out much sooner.
	return nil, vterrors.Errorf(vtrpc.Code_INTERNAL, "unreachable - did not find a valid primary candidate even though the valid candidate list was non-empty")
}

func (erp *EmergencyReparenter) promoteNewPrimary(
	ctx context.Context,
	ev *events.Reparent,
	newPrimary *topodatapb.Tablet,
	opts EmergencyReparentOptions,
	tabletMap map[string]*topo.TabletInfo,
	statusMap map[string]*replicationdatapb.StopReplicationStatus,
) error {
	var err error
	if ev.ShardInfo.PrimaryAlias == nil {
		erp.logger.Infof("setting up %v as new primary for an uninitialized cluster", newPrimary.Alias)
		// we call InitPrimary when the PrimaryAlias in the ShardInfo is empty. This happens when we have an uninitialized cluster.
		_, err = erp.tmc.InitPrimary(ctx, newPrimary, SemiSyncAckers(opts.durability, newPrimary) > 0)
	} else {
		erp.logger.Infof("starting promotion for the new primary - %v", newPrimary.Alias)
		// we call PromoteReplica which changes the tablet type, fixes the semi-sync, sets the primary to read-write and flushes the binlogs
		_, err = erp.tmc.PromoteReplica(ctx, newPrimary, SemiSyncAckers(opts.durability, newPrimary) > 0)
	}
	if err != nil {
		return vterrors.Wrapf(err, "primary-elect tablet %v failed to be upgraded to primary: %v", newPrimary.Alias, err)
	}
	// we now reparent all the replicas to the new primary we have promoted.
	// Here we do not need to wait for all the replicas; we can finish early when even one succeeds.
	_, err = erp.reparentReplicas(ctx, ev, newPrimary, tabletMap, statusMap, opts, false /* waitForAllReplicas */, true /* populateReparentJournal */)
	if err != nil {
		return err
	}
	return nil
}

// filterValidCandidates filters valid tablets, keeping only the ones which can successfully be promoted without any constraint failures and can make forward progress on being promoted
func (erp *EmergencyReparenter) filterValidCandidates(validTablets []*topodatapb.Tablet, tabletsReachable []*topodatapb.Tablet, prevPrimary *topodatapb.Tablet, opts EmergencyReparentOptions) ([]*topodatapb.Tablet, error) {
	var restrictedValidTablets []*topodatapb.Tablet
	for _, tablet := range validTablets {
		tabletAliasStr := topoproto.TabletAliasString(tablet.Alias)
		// Remove tablets which have the MustNot promote rule since they must never be promoted
		if PromotionRule(opts.durability, tablet) == promotionrule.MustNot {
			erp.logger.Infof("Removing %s from list of valid candidates for promotion because it has the Must Not promote rule", tabletAliasStr)
			if opts.NewPrimaryAlias != nil && topoproto.TabletAliasEqual(opts.NewPrimaryAlias, tablet.Alias) {
				return nil, vterrors.Errorf(vtrpc.Code_ABORTED, "proposed primary %s has a must not promotion rule", topoproto.TabletAliasString(opts.NewPrimaryAlias))
			}
			continue
		}
		// If ERS is configured to prevent cross cell promotions, remove any tablet not from the same cell as the previous primary
		if opts.PreventCrossCellPromotion && prevPrimary != nil && tablet.Alias.Cell != prevPrimary.Alias.Cell {
			erp.logger.Infof("Removing %s from list of valid candidates for promotion because it isn't in the same cell as the previous primary", tabletAliasStr)
			if opts.NewPrimaryAlias != nil && topoproto.TabletAliasEqual(opts.NewPrimaryAlias, tablet.Alias) {
				return nil, vterrors.Errorf(vtrpc.Code_ABORTED, "proposed primary %s is in a different cell than the previous primary", topoproto.TabletAliasString(opts.NewPrimaryAlias))
			}
			continue
		}
		// Remove any tablet which cannot make forward progress using the list of tablets we have reached
		if !canEstablishForTablet(opts.durability, tablet, tabletsReachable) {
			erp.logger.Infof("Removing %s from list of valid candidates for promotion because it will not be able to make forward progress on promotion with the tablets currently reachable", tabletAliasStr)
			if opts.NewPrimaryAlias != nil && topoproto.TabletAliasEqual(opts.NewPrimaryAlias, tablet.Alias) {
				return nil, vterrors.Errorf(vtrpc.Code_ABORTED, "proposed primary %s will not be able to make forward progress on being promoted", topoproto.TabletAliasString(opts.NewPrimaryAlias))
			}
			continue
		}
		restrictedValidTablets = append(restrictedValidTablets, tablet)
	}
	return restrictedValidTablets, nil
}