vitess.io/vitess@v0.16.2/go/vt/vtctl/reparentutil/emergency_reparenter.go (about)

     1  /*
     2  Copyright 2021 The Vitess Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8  	http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package reparentutil
    18  
    19  import (
    20  	"context"
    21  	"fmt"
    22  	"sync"
    23  	"time"
    24  
    25  	"google.golang.org/protobuf/proto"
    26  
    27  	"k8s.io/apimachinery/pkg/util/sets"
    28  
    29  	"vitess.io/vitess/go/event"
    30  	"vitess.io/vitess/go/mysql"
    31  	"vitess.io/vitess/go/stats"
    32  	"vitess.io/vitess/go/vt/concurrency"
    33  	"vitess.io/vitess/go/vt/logutil"
    34  	"vitess.io/vitess/go/vt/topo"
    35  	"vitess.io/vitess/go/vt/topo/topoproto"
    36  	"vitess.io/vitess/go/vt/topotools/events"
    37  	"vitess.io/vitess/go/vt/vtctl/reparentutil/promotionrule"
    38  	"vitess.io/vitess/go/vt/vterrors"
    39  	"vitess.io/vitess/go/vt/vttablet/tmclient"
    40  
    41  	logutilpb "vitess.io/vitess/go/vt/proto/logutil"
    42  	replicationdatapb "vitess.io/vitess/go/vt/proto/replicationdata"
    43  	topodatapb "vitess.io/vitess/go/vt/proto/topodata"
    44  	"vitess.io/vitess/go/vt/proto/vtrpc"
    45  )
    46  
    47  // EmergencyReparenter performs EmergencyReparentShard operations.
    48  type EmergencyReparenter struct {
    49  	ts     *topo.Server
    50  	tmc    tmclient.TabletManagerClient
    51  	logger logutil.Logger
    52  }
    53  
    54  // EmergencyReparentOptions provides optional parameters to
    55  // EmergencyReparentShard operations. Options are passed by value, so it is safe
    56  // for callers to mutate and reuse options structs for multiple calls.
    57  type EmergencyReparentOptions struct {
    58  	NewPrimaryAlias           *topodatapb.TabletAlias
    59  	IgnoreReplicas            sets.Set[string]
    60  	WaitReplicasTimeout       time.Duration
    61  	PreventCrossCellPromotion bool
    62  
    63  	// Private options managed internally. We use value passing to avoid leaking
    64  	// these details back out.
    65  	lockAction string
    66  	durability Durabler
    67  }
    68  
    69  // counters for Emergency Reparent Shard
    70  var (
    71  	ersCounter        = stats.NewGauge("ers_counter", "Number of times Emergency Reparent Shard has been run")
    72  	ersSuccessCounter = stats.NewGauge("ers_success_counter", "Number of times Emergency Reparent Shard has succeeded")
    73  	ersFailureCounter = stats.NewGauge("ers_failure_counter", "Number of times Emergency Reparent Shard has failed")
    74  )
    75  
    76  // NewEmergencyReparenter returns a new EmergencyReparenter object, ready to
    77  // perform EmergencyReparentShard operations using the given topo.Server,
    78  // TabletManagerClient, and logger.
    79  //
    80  // Providing a nil logger instance is allowed.
    81  func NewEmergencyReparenter(ts *topo.Server, tmc tmclient.TabletManagerClient, logger logutil.Logger) *EmergencyReparenter {
    82  	erp := EmergencyReparenter{
    83  		ts:     ts,
    84  		tmc:    tmc,
    85  		logger: logger,
    86  	}
    87  
    88  	if erp.logger == nil {
    89  		// Create a no-op logger so we can call functions on er.logger without
    90  		// needed to constantly check for non-nil.
    91  		erp.logger = logutil.NewCallbackLogger(func(*logutilpb.Event) {})
    92  	}
    93  
    94  	return &erp
    95  }
    96  
    97  // ReparentShard performs the EmergencyReparentShard operation on the given
    98  // keyspace and shard.
    99  func (erp *EmergencyReparenter) ReparentShard(ctx context.Context, keyspace string, shard string, opts EmergencyReparentOptions) (*events.Reparent, error) {
   100  	var err error
   101  	// First step is to lock the shard for the given operation, if not already locked
   102  	if err = topo.CheckShardLocked(ctx, keyspace, shard); err != nil {
   103  		var unlock func(*error)
   104  		opts.lockAction = erp.getLockAction(opts.NewPrimaryAlias)
   105  		ctx, unlock, err = erp.ts.LockShard(ctx, keyspace, shard, opts.lockAction)
   106  		if err != nil {
   107  			return nil, err
   108  		}
   109  		defer unlock(&err)
   110  	}
   111  
   112  	// dispatch success or failure of ERS
   113  	ev := &events.Reparent{}
   114  	defer func() {
   115  		switch err {
   116  		case nil:
   117  			ersSuccessCounter.Add(1)
   118  			event.DispatchUpdate(ev, "finished EmergencyReparentShard")
   119  		default:
   120  			ersFailureCounter.Add(1)
   121  			event.DispatchUpdate(ev, "failed EmergencyReparentShard: "+err.Error())
   122  		}
   123  	}()
   124  
   125  	err = erp.reparentShardLocked(ctx, ev, keyspace, shard, opts)
   126  
   127  	return ev, err
   128  }
   129  
   130  func (erp *EmergencyReparenter) getLockAction(newPrimaryAlias *topodatapb.TabletAlias) string {
   131  	action := "EmergencyReparentShard"
   132  
   133  	if newPrimaryAlias != nil {
   134  		action += fmt.Sprintf("(%v)", topoproto.TabletAliasString(newPrimaryAlias))
   135  	}
   136  
   137  	return action
   138  }
   139  
   140  // reparentShardLocked performs Emergency Reparent Shard operation assuming that the shard is already locked
   141  func (erp *EmergencyReparenter) reparentShardLocked(ctx context.Context, ev *events.Reparent, keyspace, shard string, opts EmergencyReparentOptions) (err error) {
   142  	// log the starting of the operation and increment the counter
   143  	erp.logger.Infof("will initiate emergency reparent shard in keyspace - %s, shard - %s", keyspace, shard)
   144  	ersCounter.Add(1)
   145  
   146  	var (
   147  		stoppedReplicationSnapshot *replicationSnapshot
   148  		shardInfo                  *topo.ShardInfo
   149  		prevPrimary                *topodatapb.Tablet
   150  		tabletMap                  map[string]*topo.TabletInfo
   151  		validCandidates            map[string]mysql.Position
   152  		intermediateSource         *topodatapb.Tablet
   153  		validCandidateTablets      []*topodatapb.Tablet
   154  		validReplacementCandidates []*topodatapb.Tablet
   155  		betterCandidate            *topodatapb.Tablet
   156  		isIdeal                    bool
   157  	)
   158  
   159  	shardInfo, err = erp.ts.GetShard(ctx, keyspace, shard)
   160  	if err != nil {
   161  		return err
   162  	}
   163  	ev.ShardInfo = *shardInfo
   164  
   165  	keyspaceDurability, err := erp.ts.GetKeyspaceDurability(ctx, keyspace)
   166  	if err != nil {
   167  		return err
   168  	}
   169  
   170  	erp.logger.Infof("Getting a new durability policy for %v", keyspaceDurability)
   171  	opts.durability, err = GetDurabilityPolicy(keyspaceDurability)
   172  	if err != nil {
   173  		return err
   174  	}
   175  
   176  	// get the previous primary according to the topology server,
   177  	// we use this information to choose the best candidate in the same cell
   178  	// and to undo promotion in case of failure
   179  	if shardInfo.PrimaryAlias != nil {
   180  		prevPrimaryInfo, err := erp.ts.GetTablet(ctx, shardInfo.PrimaryAlias)
   181  		if err != nil {
   182  			return err
   183  		}
   184  		prevPrimary = prevPrimaryInfo.Tablet
   185  	}
   186  
   187  	// read all the tablets and their information
   188  	event.DispatchUpdate(ev, "reading all tablets")
   189  	tabletMap, err = erp.ts.GetTabletMapForShard(ctx, keyspace, shard)
   190  	if err != nil {
   191  		return vterrors.Wrapf(err, "failed to get tablet map for %v/%v: %v", keyspace, shard, err)
   192  	}
   193  
   194  	// Stop replication on all the tablets and build their status map
   195  	stoppedReplicationSnapshot, err = stopReplicationAndBuildStatusMaps(ctx, erp.tmc, ev, tabletMap, topo.RemoteOperationTimeout, opts.IgnoreReplicas, opts.NewPrimaryAlias, opts.durability, erp.logger)
   196  	if err != nil {
   197  		return vterrors.Wrapf(err, "failed to stop replication and build status maps: %v", err)
   198  	}
   199  
   200  	// check that we still have the shard lock. If we don't then we can terminate at this point
   201  	if err := topo.CheckShardLocked(ctx, keyspace, shard); err != nil {
   202  		return vterrors.Wrapf(err, "lost topology lock, aborting: %v", err)
   203  	}
   204  
   205  	// find the valid candidates for becoming the primary
   206  	// this is where we check for errant GTIDs and remove the tablets that have them from consideration
   207  	validCandidates, err = FindValidEmergencyReparentCandidates(stoppedReplicationSnapshot.statusMap, stoppedReplicationSnapshot.primaryStatusMap)
   208  	if err != nil {
   209  		return err
   210  	}
   211  	// Restrict the valid candidates list. We remove any tablet which is of the type DRAINED, RESTORE or BACKUP.
   212  	validCandidates, err = restrictValidCandidates(validCandidates, tabletMap)
   213  	if err != nil {
   214  		return err
   215  	} else if len(validCandidates) == 0 {
   216  		return vterrors.Errorf(vtrpc.Code_FAILED_PRECONDITION, "no valid candidates for emergency reparent")
   217  	}
   218  
   219  	// Wait for all candidates to apply relay logs
   220  	if err = erp.waitForAllRelayLogsToApply(ctx, validCandidates, tabletMap, stoppedReplicationSnapshot.statusMap, opts.WaitReplicasTimeout); err != nil {
   221  		return err
   222  	}
   223  
   224  	// Find the intermediate source for replication that we want other tablets to replicate from.
   225  	// This step chooses the most advanced tablet. Further ties are broken by using the promotion rule.
   226  	// In case the user has specified a tablet specifically, then it is selected, as long as it is the most advanced.
   227  	// Here we also check for split brain scenarios and check that the selected replica must be more advanced than all the other valid candidates.
   228  	// We fail in case there is a split brain detected.
   229  	// The validCandidateTablets list is sorted by the replication positions with ties broken by promotion rules.
   230  	intermediateSource, validCandidateTablets, err = erp.findMostAdvanced(validCandidates, tabletMap, opts)
   231  	if err != nil {
   232  		return err
   233  	}
   234  	erp.logger.Infof("intermediate source selected - %v", intermediateSource.Alias)
   235  
   236  	// After finding the intermediate source, we want to filter the valid candidate list by the following criteria -
   237  	// 1. Only keep the tablets which can make progress after being promoted (have sufficient reachable semi-sync ackers)
   238  	// 2. Remove the tablets with the Must_not promote rule
   239  	// 3. Remove cross-cell tablets if PreventCrossCellPromotion is specified
   240  	// Our final primary candidate MUST belong to this list of valid candidates
   241  	validCandidateTablets, err = erp.filterValidCandidates(validCandidateTablets, stoppedReplicationSnapshot.reachableTablets, prevPrimary, opts)
   242  	if err != nil {
   243  		return err
   244  	}
   245  
   246  	// Check whether the intermediate source candidate selected is ideal or if it can be improved later.
   247  	// If the intermediateSource is ideal, then we can be certain that it is part of the valid candidates list.
   248  	isIdeal, err = erp.isIntermediateSourceIdeal(intermediateSource, validCandidateTablets, tabletMap, opts)
   249  	if err != nil {
   250  		return err
   251  	}
   252  	erp.logger.Infof("intermediate source is ideal candidate- %v", isIdeal)
   253  
   254  	// Check (again) we still have the topology lock.
   255  	if err = topo.CheckShardLocked(ctx, keyspace, shard); err != nil {
   256  		return vterrors.Wrapf(err, "lost topology lock, aborting: %v", err)
   257  	}
   258  
   259  	// initialize the newPrimary with the intermediate source, override this value if it is not the ideal candidate
   260  	newPrimary := intermediateSource
   261  	if !isIdeal {
   262  		// we now reparent all the tablets to start replicating from the intermediate source
   263  		// we do not promote the tablet or change the shard record. We only change the replication for all the other tablets
   264  		// it also returns the list of the tablets that started replication successfully including itself part of the validCandidateTablets list.
   265  		// These are the candidates that we can use to find a replacement.
   266  		validReplacementCandidates, err = erp.promoteIntermediateSource(ctx, ev, intermediateSource, tabletMap, stoppedReplicationSnapshot.statusMap, validCandidateTablets, opts)
   267  		if err != nil {
   268  			return err
   269  		}
   270  
   271  		// try to find a better candidate using the list we got back
   272  		// We prefer to choose a candidate which is in the same cell as our previous primary and of the best possible durability rule.
   273  		// However, if there is an explicit request from the user to promote a specific tablet, then we choose that tablet.
   274  		betterCandidate, err = erp.identifyPrimaryCandidate(intermediateSource, validReplacementCandidates, tabletMap, opts)
   275  		if err != nil {
   276  			return err
   277  		}
   278  
   279  		// if our better candidate is different from our intermediate source, then we wait for it to catch up to the intermediate source
   280  		if !topoproto.TabletAliasEqual(betterCandidate.Alias, intermediateSource.Alias) {
   281  			err = waitForCatchUp(ctx, erp.tmc, erp.logger, betterCandidate, intermediateSource, opts.WaitReplicasTimeout)
   282  			if err != nil {
   283  				return err
   284  			}
   285  			newPrimary = betterCandidate
   286  		}
   287  	}
   288  
   289  	// The new primary which will be promoted will always belong to the validCandidateTablets list because -
   290  	// 	1. 	if the intermediate source is ideal - then we know the intermediate source was in the validCandidateTablets list
   291  	// 		since we used that list
   292  	//	2. 	if the intermediate source isn't ideal - we take the intersection of the validCandidateTablets list and the one we
   293  	//		were able to reach during the promotion of intermediate source, as possible candidates. So the final candidate (even if
   294  	//		it is the intermediate source itself) will belong to the list
   295  	// Since the new primary tablet belongs to the validCandidateTablets list, we no longer need any additional constraint checks
   296  
   297  	// Final step is to promote our primary candidate
   298  	err = erp.promoteNewPrimary(ctx, ev, newPrimary, opts, tabletMap, stoppedReplicationSnapshot.statusMap)
   299  	if err != nil {
   300  		return err
   301  	}
   302  
   303  	ev.NewPrimary = proto.Clone(newPrimary).(*topodatapb.Tablet)
   304  	return err
   305  }
   306  
   307  func (erp *EmergencyReparenter) waitForAllRelayLogsToApply(
   308  	ctx context.Context,
   309  	validCandidates map[string]mysql.Position,
   310  	tabletMap map[string]*topo.TabletInfo,
   311  	statusMap map[string]*replicationdatapb.StopReplicationStatus,
   312  	waitReplicasTimeout time.Duration,
   313  ) error {
   314  	errCh := make(chan concurrency.Error)
   315  	defer close(errCh)
   316  
   317  	groupCtx, groupCancel := context.WithTimeout(ctx, waitReplicasTimeout)
   318  	defer groupCancel()
   319  
   320  	waiterCount := 0
   321  
   322  	for candidate := range validCandidates {
   323  		// When we called stopReplicationAndBuildStatusMaps, we got back two
   324  		// maps: (1) the StopReplicationStatus of any replicas that actually
   325  		// stopped replication; and (2) the PrimaryStatus of anything that
   326  		// returned ErrNotReplica, which is a tablet that is either the current
   327  		// primary or is stuck thinking it is a PRIMARY but is not in actuality.
   328  		//
   329  		// If we have a tablet in the validCandidates map that does not appear
   330  		// in the statusMap, then we have either (a) the current primary, which
   331  		// is not replicating, so it is not applying relay logs; or (b) a tablet
   332  		// that is stuck thinking it is PRIMARY but is not in actuality. In that
   333  		// second case - (b) - we will most likely find that the stuck PRIMARY
   334  		// does not have a winning position, and fail the ERS. If, on the other
   335  		// hand, it does have a winning position, we are trusting the operator
   336  		// to know what they are doing by emergency-reparenting onto that
   337  		// tablet. In either case, it does not make sense to wait for relay logs
   338  		// to apply on a tablet that was never applying relay logs in the first
   339  		// place, so we skip it, and log that we did.
   340  		status, ok := statusMap[candidate]
   341  		if !ok {
   342  			erp.logger.Infof("EmergencyReparent candidate %v not in replica status map; this means it was not running replication (because it was formerly PRIMARY), so skipping WaitForRelayLogsToApply step for this candidate", candidate)
   343  			continue
   344  		}
   345  
   346  		go func(alias string, status *replicationdatapb.StopReplicationStatus) {
   347  			var err error
   348  			defer func() {
   349  				errCh <- concurrency.Error{
   350  					Err: err,
   351  				}
   352  			}()
   353  			err = WaitForRelayLogsToApply(groupCtx, erp.tmc, tabletMap[alias], status)
   354  		}(candidate, status)
   355  
   356  		waiterCount++
   357  	}
   358  
   359  	errgroup := concurrency.ErrorGroup{
   360  		NumGoroutines:        waiterCount,
   361  		NumRequiredSuccesses: waiterCount,
   362  		NumAllowedErrors:     0,
   363  	}
   364  	rec := errgroup.Wait(groupCancel, errCh)
   365  
   366  	if len(rec.Errors) != 0 {
   367  		return vterrors.Wrapf(rec.Error(), "could not apply all relay logs within the provided waitReplicasTimeout (%s): %v", waitReplicasTimeout, rec.Error())
   368  	}
   369  
   370  	return nil
   371  }
   372  
   373  // findMostAdvanced finds the intermediate source for ERS. We always choose the most advanced one from our valid candidates list. Further ties are broken by looking at the promotion rules.
   374  func (erp *EmergencyReparenter) findMostAdvanced(
   375  	validCandidates map[string]mysql.Position,
   376  	tabletMap map[string]*topo.TabletInfo,
   377  	opts EmergencyReparentOptions,
   378  ) (*topodatapb.Tablet, []*topodatapb.Tablet, error) {
   379  	erp.logger.Infof("started finding the intermediate source")
   380  	// convert the valid candidates into a list so that we can use it for sorting
   381  	validTablets, tabletPositions, err := getValidCandidatesAndPositionsAsList(validCandidates, tabletMap)
   382  	if err != nil {
   383  		return nil, nil, err
   384  	}
   385  
   386  	// sort the tablets for finding the best intermediate source in ERS
   387  	err = sortTabletsForReparent(validTablets, tabletPositions, opts.durability)
   388  	if err != nil {
   389  		return nil, nil, err
   390  	}
   391  	for _, tablet := range validTablets {
   392  		erp.logger.Infof("finding intermediate source - sorted replica: %v", tablet.Alias)
   393  	}
   394  
   395  	// The first tablet in the sorted list will be the most eligible candidate unless explicitly asked for some other tablet
   396  	winningPrimaryTablet := validTablets[0]
   397  	winningPosition := tabletPositions[0]
   398  
   399  	// We have already removed the tablets with errant GTIDs before calling this function. At this point our winning position must be a
   400  	// superset of all the other valid positions. If that is not the case, then we have a split brain scenario, and we should cancel the ERS
   401  	for i, position := range tabletPositions {
   402  		if !winningPosition.AtLeast(position) {
   403  			return nil, nil, vterrors.Errorf(vtrpc.Code_FAILED_PRECONDITION, "split brain detected between servers - %v and %v", winningPrimaryTablet.Alias, validTablets[i].Alias)
   404  		}
   405  	}
   406  
   407  	// If we were requested to elect a particular primary, verify it's a valid
   408  	// candidate (non-zero position, no errant GTIDs)
   409  	if opts.NewPrimaryAlias != nil {
   410  		requestedPrimaryAlias := topoproto.TabletAliasString(opts.NewPrimaryAlias)
   411  		pos, ok := validCandidates[requestedPrimaryAlias]
   412  		if !ok {
   413  			return nil, nil, vterrors.Errorf(vtrpc.Code_FAILED_PRECONDITION, "requested primary elect %v has errant GTIDs", requestedPrimaryAlias)
   414  		}
   415  		// if the requested tablet is as advanced as the most advanced tablet, then we can just use it for promotion.
   416  		// otherwise, we should let it catchup to the most advanced tablet and not change the intermediate source
   417  		if pos.AtLeast(winningPosition) {
   418  			requestedPrimaryInfo, isFound := tabletMap[requestedPrimaryAlias]
   419  			if !isFound {
   420  				return nil, nil, vterrors.Errorf(vtrpc.Code_INTERNAL, "candidate %v not found in the tablet map; this an impossible situation", requestedPrimaryAlias)
   421  			}
   422  			winningPrimaryTablet = requestedPrimaryInfo.Tablet
   423  		}
   424  	}
   425  
   426  	return winningPrimaryTablet, validTablets, nil
   427  }
   428  
   429  // promoteIntermediateSource reparents all the other tablets to start replicating from the intermediate source.
   430  // It does not promote this tablet to a primary instance, we only let other replicas start replicating from this tablet
   431  func (erp *EmergencyReparenter) promoteIntermediateSource(
   432  	ctx context.Context,
   433  	ev *events.Reparent,
   434  	source *topodatapb.Tablet,
   435  	tabletMap map[string]*topo.TabletInfo,
   436  	statusMap map[string]*replicationdatapb.StopReplicationStatus,
   437  	validCandidateTablets []*topodatapb.Tablet,
   438  	opts EmergencyReparentOptions,
   439  ) ([]*topodatapb.Tablet, error) {
   440  	// we reparent all the other tablets to start replication from our new source
   441  	// we wait for all the replicas so that we can choose a better candidate from the ones that started replication later
   442  	reachableTablets, err := erp.reparentReplicas(ctx, ev, source, tabletMap, statusMap, opts, true /* waitForAllReplicas */, false /* populateReparentJournal */)
   443  	if err != nil {
   444  		return nil, err
   445  	}
   446  
   447  	// also include the current tablet for being considered as part of valid candidates for ERS promotion
   448  	reachableTablets = append(reachableTablets, source)
   449  
   450  	// The only valid candidates for improvement are the ones which are reachable and part of the valid candidate list.
   451  	// Here we need to be careful not to mess up the ordering of tablets in validCandidateTablets, since the list is sorted by the
   452  	// replication positions.
   453  	var validCandidatesForImprovement []*topodatapb.Tablet
   454  	for _, tablet := range validCandidateTablets {
   455  		if topoproto.IsTabletInList(tablet, reachableTablets) {
   456  			validCandidatesForImprovement = append(validCandidatesForImprovement, tablet)
   457  		}
   458  	}
   459  	return validCandidatesForImprovement, nil
   460  }
   461  
   462  // reparentReplicas reparents all the replicas provided and populates the reparent journal on the primary if asked.
   463  // Also, it returns the replicas which started replicating only in the case where we wait for all the replicas
   464  func (erp *EmergencyReparenter) reparentReplicas(
   465  	ctx context.Context,
   466  	ev *events.Reparent,
   467  	newPrimaryTablet *topodatapb.Tablet,
   468  	tabletMap map[string]*topo.TabletInfo,
   469  	statusMap map[string]*replicationdatapb.StopReplicationStatus,
   470  	opts EmergencyReparentOptions,
   471  	waitForAllReplicas bool,
   472  	populateReparentJournal bool,
   473  ) ([]*topodatapb.Tablet, error) {
   474  
   475  	var (
   476  		replicasStartedReplication []*topodatapb.Tablet
   477  		replicaMutex               sync.Mutex
   478  	)
   479  
   480  	replCtx, replCancel := context.WithTimeout(context.Background(), opts.WaitReplicasTimeout)
   481  
   482  	event.DispatchUpdate(ev, "reparenting all tablets")
   483  
   484  	// Create a context and cancel function to watch for the first successful
   485  	// SetReplicationSource call on a replica. We use a background context so that this
   486  	// context is only ever Done when its cancel is called by the background
   487  	// goroutine we're about to spin up.
   488  	//
   489  	// Similarly, create a context and cancel for the replica waiter goroutine
   490  	// to signal when all replica goroutines have finished. In the case where at
   491  	// least one replica succeeds, replSuccessCtx will be canceled first, while
   492  	// allReplicasDoneCtx is guaranteed to be canceled within
   493  	// opts.WaitReplicasTimeout plus some jitter.
   494  	replSuccessCtx, replSuccessCancel := context.WithCancel(context.Background())
   495  	allReplicasDoneCtx, allReplicasDoneCancel := context.WithCancel(context.Background())
   496  
   497  	now := time.Now().UnixNano()
   498  	replWg := sync.WaitGroup{}
   499  	rec := concurrency.AllErrorRecorder{}
   500  
   501  	handlePrimary := func(alias string, tablet *topodatapb.Tablet) error {
   502  		position, err := erp.tmc.PrimaryPosition(replCtx, tablet)
   503  		if err != nil {
   504  			return err
   505  		}
   506  		if populateReparentJournal {
   507  			erp.logger.Infof("populating reparent journal on new primary %v", alias)
   508  			return erp.tmc.PopulateReparentJournal(replCtx, tablet, now, opts.lockAction, newPrimaryTablet.Alias, position)
   509  		}
   510  		return nil
   511  	}
   512  
   513  	handleReplica := func(alias string, ti *topo.TabletInfo) {
   514  		defer replWg.Done()
   515  		erp.logger.Infof("setting new primary on replica %v", alias)
   516  
   517  		forceStart := false
   518  		if status, ok := statusMap[alias]; ok {
   519  			fs, err := ReplicaWasRunning(status)
   520  			if err != nil {
   521  				err = vterrors.Wrapf(err, "tablet %v could not determine StopReplicationStatus: %v", alias, err)
   522  				rec.RecordError(err)
   523  
   524  				return
   525  			}
   526  
   527  			forceStart = fs
   528  		}
   529  
   530  		err := erp.tmc.SetReplicationSource(replCtx, ti.Tablet, newPrimaryTablet.Alias, 0, "", forceStart, IsReplicaSemiSync(opts.durability, newPrimaryTablet, ti.Tablet))
   531  		if err != nil {
   532  			err = vterrors.Wrapf(err, "tablet %v SetReplicationSource failed: %v", alias, err)
   533  			rec.RecordError(err)
   534  
   535  			return
   536  		}
   537  
   538  		replicaMutex.Lock()
   539  		replicasStartedReplication = append(replicasStartedReplication, ti.Tablet)
   540  		replicaMutex.Unlock()
   541  
   542  		// Signal that at least one goroutine succeeded to SetReplicationSource.
   543  		// We do this only when we do not want to wait for all the replicas
   544  		if !waitForAllReplicas {
   545  			replSuccessCancel()
   546  		}
   547  	}
   548  
   549  	numReplicas := 0
   550  
   551  	for alias, ti := range tabletMap {
   552  		switch {
   553  		case alias == topoproto.TabletAliasString(newPrimaryTablet.Alias):
   554  			continue
   555  		case !opts.IgnoreReplicas.Has(alias):
   556  			replWg.Add(1)
   557  			numReplicas++
   558  			go handleReplica(alias, ti)
   559  		}
   560  	}
   561  
   562  	// Spin up a background goroutine to wait until all replica goroutines
   563  	// finished. Polling this way allows us to have reparentReplicas return
   564  	// success as soon as (a) the primary successfully populates its reparent
   565  	// journal and (b) at least one replica successfully begins replicating.
   566  	//
   567  	// If we were to follow the more common pattern of blocking on replWg.Wait()
   568  	// in the main body of promoteNewPrimary, we would be bound to the
   569  	// time of slowest replica, instead of the time of the fastest successful
   570  	// replica, and we want ERS to be fast.
   571  	go func() {
   572  		replWg.Wait()
   573  		allReplicasDoneCancel()
   574  	}()
   575  
   576  	primaryErr := handlePrimary(topoproto.TabletAliasString(newPrimaryTablet.Alias), newPrimaryTablet)
   577  	if primaryErr != nil {
   578  		erp.logger.Warningf("primary failed to PopulateReparentJournal")
   579  		replCancel()
   580  
   581  		return nil, vterrors.Wrapf(primaryErr, "failed to PopulateReparentJournal on primary: %v", primaryErr)
   582  	}
   583  
   584  	// We should only cancel the context that all the replicas are using when they are done.
   585  	// Since this function can return early when only 1 replica succeeds, if we cancel this context as a deferred call from this function,
   586  	// then we would end up having cancelled the context for the replicas who have not yet finished running all the commands.
   587  	// This leads to some replicas not starting replication properly. So we must wait for all the replicas to finish before cancelling this context.
   588  	go func() {
   589  		replWg.Wait()
   590  		defer replCancel()
   591  	}()
   592  
   593  	select {
   594  	case <-replSuccessCtx.Done():
   595  		// At least one replica was able to SetReplicationSource successfully
   596  		// Here we do not need to return the replicas which started replicating
   597  		return nil, nil
   598  	case <-allReplicasDoneCtx.Done():
   599  		// There are certain timing issues between replSuccessCtx.Done firing
   600  		// and allReplicasDoneCtx.Done firing, so we check again if truly all
   601  		// replicas failed (where `numReplicas` goroutines recorded an error) or
   602  		// one or more actually managed to succeed.
   603  		errCount := len(rec.Errors)
   604  
   605  		switch {
   606  		case errCount > numReplicas:
   607  			// Technically, rec.Errors should never be greater than numReplicas,
   608  			// but it's better to err on the side of caution here, but also
   609  			// we're going to be explicit that this is doubly unexpected.
   610  			return nil, vterrors.Wrapf(rec.Error(), "received more errors (= %d) than replicas (= %d), which should be impossible: %v", errCount, numReplicas, rec.Error())
   611  		case errCount == numReplicas:
   612  			if len(tabletMap) <= 2 {
   613  				// If there are at most 2 tablets in the tablet map, we shouldn't be failing the promotion if the replica fails to SetReplicationSource.
   614  				// The failing replica is probably the old primary that is down, so it is okay if it fails. We still log a warning message in the logs.
   615  				erp.logger.Warningf("Failed to set the MySQL replication source during ERS but because there is only one other tablet we assume it is the one that had failed and will progress with the reparent. Error: %v", rec.Error())
   616  				return nil, nil
   617  			}
   618  			return nil, vterrors.Wrapf(rec.Error(), "%d replica(s) failed: %v", numReplicas, rec.Error())
   619  		default:
   620  			return replicasStartedReplication, nil
   621  		}
   622  	}
   623  
   624  }
   625  
   626  // isIntermediateSourceIdeal is used to find whether the intermediate source that ERS chose is also the ideal one or not
   627  func (erp *EmergencyReparenter) isIntermediateSourceIdeal(
   628  	intermediateSource *topodatapb.Tablet,
   629  	validCandidates []*topodatapb.Tablet,
   630  	tabletMap map[string]*topo.TabletInfo,
   631  	opts EmergencyReparentOptions,
   632  ) (bool, error) {
   633  	// we try to find a better candidate with the current list of valid candidates, and if it matches our current primary candidate, then we return true
   634  	candidate, err := erp.identifyPrimaryCandidate(intermediateSource, validCandidates, tabletMap, opts)
   635  	if err != nil {
   636  		return false, err
   637  	}
   638  	return candidate == intermediateSource, nil
   639  }
   640  
   641  // identifyPrimaryCandidate is used to find the final candidate for ERS promotion
   642  func (erp *EmergencyReparenter) identifyPrimaryCandidate(
   643  	intermediateSource *topodatapb.Tablet,
   644  	validCandidates []*topodatapb.Tablet,
   645  	tabletMap map[string]*topo.TabletInfo,
   646  	opts EmergencyReparentOptions,
   647  ) (candidate *topodatapb.Tablet, err error) {
   648  	defer func() {
   649  		if candidate != nil {
   650  			erp.logger.Infof("found better candidate - %v", candidate.Alias)
   651  		}
   652  	}()
   653  
   654  	if len(validCandidates) == 0 {
   655  		return nil, vterrors.Errorf(vtrpc.Code_FAILED_PRECONDITION, "no valid candidates for emergency reparent")
   656  	}
   657  
   658  	if opts.NewPrimaryAlias != nil {
   659  		// explicit request to promote a specific tablet
   660  		requestedPrimaryAlias := topoproto.TabletAliasString(opts.NewPrimaryAlias)
   661  		requestedPrimaryInfo, isFound := tabletMap[requestedPrimaryAlias]
   662  		if !isFound {
   663  			return nil, vterrors.Errorf(vtrpc.Code_INTERNAL, "candidate %v not found in the tablet map; this an impossible situation", requestedPrimaryAlias)
   664  		}
   665  		if topoproto.IsTabletInList(requestedPrimaryInfo.Tablet, validCandidates) {
   666  			return requestedPrimaryInfo.Tablet, nil
   667  		}
   668  		return nil, vterrors.Errorf(vtrpc.Code_ABORTED, "requested candidate %v is not in valid candidates list", requestedPrimaryAlias)
   669  	}
   670  
   671  	// We have already selected an intermediate source which was selected based on the replication position
   672  	// (ties broken by promotion rules), but that tablet might not even be a valid candidate i.e. it could
   673  	// be in a different cell when we have PreventCrossCellPromotion specified, or it could have a promotion rule of
   674  	// MustNot. Even if it is valid, there could be a tablet with a better promotion rule. This is what we try to
   675  	// find here.
   676  	// We go over all the promotion rules in descending order of priority and try and find a valid candidate with
   677  	// that promotion rule.
   678  	// If the intermediate source has the same promotion rules as some other tablets, then we prioritize using
   679  	// the intermediate source since we won't have to wait for the new candidate to catch up!
   680  	for _, promotionRule := range promotionrule.AllPromotionRules() {
   681  		candidates := getTabletsWithPromotionRules(opts.durability, validCandidates, promotionRule)
   682  		candidate = findCandidate(intermediateSource, candidates)
   683  		if candidate != nil {
   684  			return candidate, nil
   685  		}
   686  	}
   687  	// Unreachable code.
   688  	// We should have found atleast 1 tablet in the valid list.
   689  	// If the list is empty, then we should have errored out much sooner.
   690  	return nil, vterrors.Errorf(vtrpc.Code_INTERNAL, "unreachable - did not find a valid primary candidate even though the valid candidate list was non-empty")
   691  }
   692  
   693  func (erp *EmergencyReparenter) promoteNewPrimary(
   694  	ctx context.Context,
   695  	ev *events.Reparent,
   696  	newPrimary *topodatapb.Tablet,
   697  	opts EmergencyReparentOptions,
   698  	tabletMap map[string]*topo.TabletInfo,
   699  	statusMap map[string]*replicationdatapb.StopReplicationStatus,
   700  ) error {
   701  	var err error
   702  	if ev.ShardInfo.PrimaryAlias == nil {
   703  		erp.logger.Infof("setting up %v as new primary for an uninitialized cluster", newPrimary.Alias)
   704  		// we call InitPrimary when the PrimaryAlias in the ShardInfo is empty. This happens when we have an uninitialized cluster.
   705  		_, err = erp.tmc.InitPrimary(ctx, newPrimary, SemiSyncAckers(opts.durability, newPrimary) > 0)
   706  	} else {
   707  		erp.logger.Infof("starting promotion for the new primary - %v", newPrimary.Alias)
   708  		// we call PromoteReplica which changes the tablet type, fixes the semi-sync, set the primary to read-write and flushes the binlogs
   709  		_, err = erp.tmc.PromoteReplica(ctx, newPrimary, SemiSyncAckers(opts.durability, newPrimary) > 0)
   710  	}
   711  	if err != nil {
   712  		return vterrors.Wrapf(err, "primary-elect tablet %v failed to be upgraded to primary: %v", newPrimary.Alias, err)
   713  	}
   714  	// we now reparent all the replicas to the new primary we have promoted.
   715  	// Here we do not need to wait for all the replicas, We can finish early when even 1 succeeds.
   716  	_, err = erp.reparentReplicas(ctx, ev, newPrimary, tabletMap, statusMap, opts, false /* waitForAllReplicas */, true /* populateReparentJournal */)
   717  	if err != nil {
   718  		return err
   719  	}
   720  	return nil
   721  }
   722  
   723  // filterValidCandidates filters valid tablets, keeping only the ones which can successfully be promoted without any constraint failures and can make forward progress on being promoted
   724  func (erp *EmergencyReparenter) filterValidCandidates(validTablets []*topodatapb.Tablet, tabletsReachable []*topodatapb.Tablet, prevPrimary *topodatapb.Tablet, opts EmergencyReparentOptions) ([]*topodatapb.Tablet, error) {
   725  	var restrictedValidTablets []*topodatapb.Tablet
   726  	for _, tablet := range validTablets {
   727  		tabletAliasStr := topoproto.TabletAliasString(tablet.Alias)
   728  		// Remove tablets which have MustNot promote rule since they must never be promoted
   729  		if PromotionRule(opts.durability, tablet) == promotionrule.MustNot {
   730  			erp.logger.Infof("Removing %s from list of valid candidates for promotion because it has the Must Not promote rule", tabletAliasStr)
   731  			if opts.NewPrimaryAlias != nil && topoproto.TabletAliasEqual(opts.NewPrimaryAlias, tablet.Alias) {
   732  				return nil, vterrors.Errorf(vtrpc.Code_ABORTED, "proposed primary %s has a must not promotion rule", topoproto.TabletAliasString(opts.NewPrimaryAlias))
   733  			}
   734  			continue
   735  		}
   736  		// If ERS is configured to prevent cross cell promotions, remove any tablet not from the same cell as the previous primary
   737  		if opts.PreventCrossCellPromotion && prevPrimary != nil && tablet.Alias.Cell != prevPrimary.Alias.Cell {
   738  			erp.logger.Infof("Removing %s from list of valid candidates for promotion because it isn't in the same cell as the previous primary", tabletAliasStr)
   739  			if opts.NewPrimaryAlias != nil && topoproto.TabletAliasEqual(opts.NewPrimaryAlias, tablet.Alias) {
   740  				return nil, vterrors.Errorf(vtrpc.Code_ABORTED, "proposed primary %s is is a different cell as the previous primary", topoproto.TabletAliasString(opts.NewPrimaryAlias))
   741  			}
   742  			continue
   743  		}
   744  		// Remove any tablet which cannot make forward progress using the list of tablets we have reached
   745  		if !canEstablishForTablet(opts.durability, tablet, tabletsReachable) {
   746  			erp.logger.Infof("Removing %s from list of valid candidates for promotion because it will not be able to make forward progress on promotion with the tablets currently reachable", tabletAliasStr)
   747  			if opts.NewPrimaryAlias != nil && topoproto.TabletAliasEqual(opts.NewPrimaryAlias, tablet.Alias) {
   748  				return nil, vterrors.Errorf(vtrpc.Code_ABORTED, "proposed primary %s will not be able to make forward progress on being promoted", topoproto.TabletAliasString(opts.NewPrimaryAlias))
   749  			}
   750  			continue
   751  		}
   752  		restrictedValidTablets = append(restrictedValidTablets, tablet)
   753  	}
   754  	return restrictedValidTablets, nil
   755  }