vitess.io/vitess@v0.16.2/go/vt/vtctl/reparentutil/planned_reparenter.go

     1  /*
     2  Copyright 2021 The Vitess Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8  	http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package reparentutil
    18  
    19  import (
    20  	"context"
    21  	"fmt"
    22  	"sync"
    23  	"time"
    24  
    25  	"google.golang.org/protobuf/proto"
    26  
    27  	"vitess.io/vitess/go/event"
    28  	"vitess.io/vitess/go/mysql"
    29  	"vitess.io/vitess/go/vt/concurrency"
    30  	"vitess.io/vitess/go/vt/logutil"
    31  	"vitess.io/vitess/go/vt/topo"
    32  	"vitess.io/vitess/go/vt/topo/topoproto"
    33  	"vitess.io/vitess/go/vt/topotools/events"
    34  	"vitess.io/vitess/go/vt/vterrors"
    35  	"vitess.io/vitess/go/vt/vttablet/tmclient"
    36  
    37  	logutilpb "vitess.io/vitess/go/vt/proto/logutil"
    38  	topodatapb "vitess.io/vitess/go/vt/proto/topodata"
    39  	"vitess.io/vitess/go/vt/proto/vtrpc"
    40  )
    41  
    42  // PlannedReparenter performs PlannedReparentShard operations.
    43  type PlannedReparenter struct {
    44  	ts     *topo.Server
    45  	tmc    tmclient.TabletManagerClient
    46  	logger logutil.Logger
    47  }
    48  
    49  // PlannedReparentOptions provides optional parameters to PlannedReparentShard
    50  // operations. Options are passed by value, so it is safe for callers to mutate
    51  // and reuse options structs for multiple calls.
    52  type PlannedReparentOptions struct {
    53  	NewPrimaryAlias     *topodatapb.TabletAlias
    54  	AvoidPrimaryAlias   *topodatapb.TabletAlias
    55  	WaitReplicasTimeout time.Duration
    56  
    57  	// Private options managed internally. We use value-passing semantics to
    58  	// set these options inside a PlannedReparent without leaking these details
    59  	// back out to the caller.
    60  
    61  	lockAction string
    62  	durability Durabler
    63  }
    64  
    65  // NewPlannedReparenter returns a new PlannedReparenter object, ready to perform
    66  // PlannedReparentShard operations using the given topo.Server,
    67  // TabletManagerClient, and logger.
    68  //
    69  // Providing a nil logger instance is allowed.
    70  func NewPlannedReparenter(ts *topo.Server, tmc tmclient.TabletManagerClient, logger logutil.Logger) *PlannedReparenter {
    71  	pr := PlannedReparenter{
    72  		ts:     ts,
    73  		tmc:    tmc,
    74  		logger: logger,
    75  	}
    76  
    77  	if pr.logger == nil {
    78  		// Create a no-op logger so we can call functions on pr.logger without
    79  		// needing to constantly check it for non-nil first.
    80  		pr.logger = logutil.NewCallbackLogger(func(e *logutilpb.Event) {})
    81  	}
    82  
    83  	return &pr
    84  }
    85  
    86  // ReparentShard performs a PlannedReparentShard operation on the given keyspace
    87  // and shard. It will make the provided tablet the primary for the shard, when
    88  // both the current and desired primary are reachable and in a good state.
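        //
        // A minimal usage sketch (ts, tmc, logger, ctx, and newPrimaryAlias are assumed
        // to already be in scope; the keyspace and shard names are illustrative):
        //
        //	pr := NewPlannedReparenter(ts, tmc, logger)
        //	ev, err := pr.ReparentShard(ctx, "commerce", "0", PlannedReparentOptions{
        //		NewPrimaryAlias:     newPrimaryAlias,
        //		WaitReplicasTimeout: 30 * time.Second,
        //	})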
    89  func (pr *PlannedReparenter) ReparentShard(ctx context.Context, keyspace string, shard string, opts PlannedReparentOptions) (*events.Reparent, error) {
    90  	var err error
    91  	if err = topo.CheckShardLocked(ctx, keyspace, shard); err != nil {
    92  		var unlock func(*error)
    93  		opts.lockAction = pr.getLockAction(opts)
    94  		ctx, unlock, err = pr.ts.LockShard(ctx, keyspace, shard, opts.lockAction)
    95  		if err != nil {
    96  			return nil, err
    97  		}
    98  		defer unlock(&err)
    99  	}
   100  
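        	// If the caller specified neither a new primary nor a tablet to avoid,
        	// default to avoiding the current shard primary so that a different
        	// candidate is chosen during the preflight checks.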
   101  	if opts.NewPrimaryAlias == nil && opts.AvoidPrimaryAlias == nil {
   102  		shardInfo, err := pr.ts.GetShard(ctx, keyspace, shard)
   103  		if err != nil {
   104  			return nil, err
   105  		}
   106  
   107  		opts.AvoidPrimaryAlias = shardInfo.PrimaryAlias
   108  	}
   109  
   110  	ev := &events.Reparent{}
   111  	defer func() {
   112  		switch err {
   113  		case nil:
   114  			event.DispatchUpdate(ev, "finished PlannedReparentShard")
   115  		default:
   116  			event.DispatchUpdate(ev, "failed PlannedReparentShard: "+err.Error())
   117  		}
   118  	}()
   119  
   120  	err = pr.reparentShardLocked(ctx, ev, keyspace, shard, opts)
   121  
   122  	return ev, err
   123  }
   124  
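        // getLockAction returns the action string recorded with the shard lock for
        // this PlannedReparentShard call, identifying the requested and avoided
        // primaries.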
   125  func (pr *PlannedReparenter) getLockAction(opts PlannedReparentOptions) string {
   126  	return fmt.Sprintf(
   127  		"PlannedReparentShard(%v, AvoidPrimary = %v)",
   128  		topoproto.TabletAliasString(opts.NewPrimaryAlias),
   129  		topoproto.TabletAliasString(opts.AvoidPrimaryAlias),
   130  	)
   131  }
   132  
   133  // preflightChecks checks some invariants that pr.reparentShardLocked() depends
   134  // on. It returns a boolean to indicate if the reparent is a no-op (which
   135  // happens iff the caller specified an AvoidPrimaryAlias and it's not the shard
   136  // primary), as well as an error.
   137  //
   138  // It will also set the NewPrimaryAlias option if the caller did not specify
   139  // one, provided it can choose a new primary candidate. See ChooseNewPrimary()
   140  // for details on primary candidate selection.
   141  func (pr *PlannedReparenter) preflightChecks(
   142  	ctx context.Context,
   143  	ev *events.Reparent,
   144  	keyspace string,
   145  	shard string,
   146  	tabletMap map[string]*topo.TabletInfo,
   147  	opts *PlannedReparentOptions, // we take a pointer here to set NewPrimaryAlias
   148  ) (isNoop bool, err error) {
   149  	// We don't want to fail when both NewPrimaryAlias and AvoidPrimaryAlias are nil.
   150  	// But when they are both nil, we assign AvoidPrimaryAlias to be ShardInfo.PrimaryAlias.
   151  	// In the case where we are using PRS to initialize the cluster without specifying the NewPrimaryAlias,
   152  	// all three will be nil.
   153  	if opts.NewPrimaryAlias != nil && topoproto.TabletAliasEqual(opts.NewPrimaryAlias, opts.AvoidPrimaryAlias) {
   154  		return true, vterrors.Errorf(vtrpc.Code_FAILED_PRECONDITION, "primary-elect tablet %v is the same as the tablet to avoid", topoproto.TabletAliasString(opts.NewPrimaryAlias))
   155  	}
   156  
   157  	if opts.NewPrimaryAlias == nil {
   158  		// We don't want to fail when both ShardInfo.PrimaryAlias and AvoidPrimaryAlias are nil.
   159  		// This happens when we are using PRS to initialize the cluster without specifying the NewPrimaryAlias
   160  		if ev.ShardInfo.PrimaryAlias != nil && !topoproto.TabletAliasEqual(opts.AvoidPrimaryAlias, ev.ShardInfo.PrimaryAlias) {
   161  			event.DispatchUpdate(ev, "current primary is different than tablet to avoid, nothing to do")
   162  			return true, nil
   163  		}
   164  
   165  		event.DispatchUpdate(ev, "searching for primary candidate")
   166  
   167  		opts.NewPrimaryAlias, err = ChooseNewPrimary(ctx, pr.tmc, &ev.ShardInfo, tabletMap, opts.AvoidPrimaryAlias, opts.WaitReplicasTimeout, opts.durability, pr.logger)
   168  		if err != nil {
   169  			return true, err
   170  		}
   171  
   172  		if opts.NewPrimaryAlias == nil {
   173  			return true, vterrors.Errorf(vtrpc.Code_INTERNAL, "cannot find a tablet to reparent to in the same cell as the current primary")
   174  		}
   175  
   176  		pr.logger.Infof("elected new primary candidate %v", topoproto.TabletAliasString(opts.NewPrimaryAlias))
   177  		event.DispatchUpdate(ev, "elected new primary candidate")
   178  	}
   179  
   180  	primaryElectAliasStr := topoproto.TabletAliasString(opts.NewPrimaryAlias)
   181  
   182  	newPrimaryTabletInfo, ok := tabletMap[primaryElectAliasStr]
   183  	if !ok {
   184  		return true, vterrors.Errorf(vtrpc.Code_FAILED_PRECONDITION, "primary-elect tablet %v is not in the shard", primaryElectAliasStr)
   185  	}
   186  
   187  	// PRS is only meant to be called when all the tablets are healthy.
   188  	// So we assume that all the tablets are reachable and check if the primary elect will be able
   189  	// to make progress if it is promoted. This is needed because sometimes users may ask to promote
   190  	// a tablet which can never make progress. For example, let's say the user has a durability policy
   191  	// where they require 2 semi-sync acks, but only from cross-cell replicas.
   192  	// Let's say they have 3 replicas: A in zone 1, and B and C in zone 2. In this case, A is the only
   193  	// eligible primary-elect. Neither B nor C would be able to make forward progress if promoted.
   194  	var tabletsReachable []*topodatapb.Tablet
   195  	for _, info := range tabletMap {
   196  		tabletsReachable = append(tabletsReachable, info.Tablet)
   197  	}
   198  	if !canEstablishForTablet(opts.durability, newPrimaryTabletInfo.Tablet, tabletsReachable) {
   199  		return true, vterrors.Errorf(vtrpc.Code_FAILED_PRECONDITION, "primary-elect tablet %v won't be able to make forward progress on promotion", primaryElectAliasStr)
   200  	}
   201  
   202  	ev.NewPrimary = proto.Clone(newPrimaryTabletInfo.Tablet).(*topodatapb.Tablet)
   203  
   204  	return false, nil
   205  }
   206  
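        // performGracefulPromotion handles the case where the current primary and the
        // primary-elect differ. It verifies that the primary-elect is replicating from
        // the current primary and can catch up to a snapshot of its position, demotes
        // the current primary, waits for the primary-elect to reach the demotion
        // position (undoing the demotion if that wait fails), and then promotes the
        // primary-elect. It returns the position to use for the reparent journal.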
   207  func (pr *PlannedReparenter) performGracefulPromotion(
   208  	ctx context.Context,
   209  	ev *events.Reparent,
   210  	keyspace string,
   211  	shard string,
   212  	currentPrimary *topo.TabletInfo,
   213  	primaryElect *topodatapb.Tablet,
   214  	tabletMap map[string]*topo.TabletInfo,
   215  	opts PlannedReparentOptions,
   216  ) (string, error) {
   217  	primaryElectAliasStr := topoproto.TabletAliasString(primaryElect.Alias)
   218  	ev.OldPrimary = proto.Clone(currentPrimary.Tablet).(*topodatapb.Tablet)
   219  
   220  	// Before demoting the old primary, we're going to ensure that replication
   221  	// is working from the old primary to the primary-elect. If replication is
   222  	// not working, a PlannedReparent is not safe to do, because the candidate
   223  	// won't catch up and we'll potentially miss transactions.
   224  	pr.logger.Infof("checking replication on primary-elect %v", primaryElectAliasStr)
   225  
   226  	// First, we find the position of the current primary. Note that this is
   227  	// just a snapshot of the position, since we let it keep accepting writes
   228  	// until we're sure we want to proceed with the promotion.
   229  	snapshotCtx, snapshotCancel := context.WithTimeout(ctx, topo.RemoteOperationTimeout)
   230  	defer snapshotCancel()
   231  
   232  	snapshotPos, err := pr.tmc.PrimaryPosition(snapshotCtx, currentPrimary.Tablet)
   233  	if err != nil {
   234  		return "", vterrors.Wrapf(err, "cannot get replication position on current primary %v; current primary must be healthy to perform PlannedReparent", currentPrimary.AliasString())
   235  	}
   236  
   237  	// Next, we wait for the primary-elect to catch up to that snapshot point.
   238  	// If it can catch up within WaitReplicasTimeout, we can be fairly
   239  	// confident that it will catch up on everything else that happens between
   240  	// the snapshot point we grabbed above and when we demote the old primary
   241  	// below.
   242  	//
   243  	// We do this as an idempotent SetReplicationSource to make sure the replica knows who
   244  	// the current primary is.
   245  	setSourceCtx, setSourceCancel := context.WithTimeout(ctx, opts.WaitReplicasTimeout)
   246  	defer setSourceCancel()
   247  
   248  	if err := pr.tmc.SetReplicationSource(setSourceCtx, primaryElect, currentPrimary.Alias, 0, snapshotPos, true, IsReplicaSemiSync(opts.durability, currentPrimary.Tablet, primaryElect)); err != nil {
   249  		return "", vterrors.Wrapf(err, "replication on primary-elect %v did not catch up in time; replication must be healthy to perform PlannedReparent", primaryElectAliasStr)
   250  	}
   251  
   252  	// Verify we still have the topology lock before doing the demotion.
   253  	if err := topo.CheckShardLocked(ctx, keyspace, shard); err != nil {
   254  		return "", vterrors.Wrap(err, "lost topology lock; aborting")
   255  	}
   256  
   257  	// Next up, demote the current primary and get its replication position.
   258  	// It's fine if the current primary was already demoted, since DemotePrimary
   259  	// is idempotent.
   260  	pr.logger.Infof("demoting current primary: %v", currentPrimary.AliasString())
   261  	event.DispatchUpdate(ev, "demoting old primary")
   262  
   263  	demoteCtx, demoteCancel := context.WithTimeout(ctx, topo.RemoteOperationTimeout)
   264  	defer demoteCancel()
   265  
   266  	primaryStatus, err := pr.tmc.DemotePrimary(demoteCtx, currentPrimary.Tablet)
   267  	if err != nil {
   268  		return "", vterrors.Wrapf(err, "failed to DemotePrimary on current primary %v: %v", currentPrimary.AliasString(), err)
   269  	}
   270  
   271  	// Wait for the primary-elect to catch up to the position we demoted the
   272  	// current primary at. If it fails to catch up within WaitReplicasTimeout,
   273  	// we will try to roll back to the original primary before aborting.
   274  	waitCtx, waitCancel := context.WithTimeout(ctx, opts.WaitReplicasTimeout)
   275  	defer waitCancel()
   276  
   277  	waitErr := pr.tmc.WaitForPosition(waitCtx, primaryElect, primaryStatus.Position)
   278  
   279  	// Do some wrapping of errors to get the right codes and callstacks.
   280  	var finalWaitErr error
   281  	switch {
   282  	case waitErr != nil:
   283  		finalWaitErr = vterrors.Wrapf(waitErr, "primary-elect tablet %v failed to catch up with replication %v", primaryElectAliasStr, primaryStatus.Position)
   284  	case ctx.Err() == context.DeadlineExceeded:
   285  		finalWaitErr = vterrors.New(vtrpc.Code_DEADLINE_EXCEEDED, "PlannedReparent timed out; please try again")
   286  	}
   287  
   288  	if finalWaitErr != nil {
   289  		// It's possible that we've used up the calling context's timeout, or
   290  		// that not enough time is left on it to finish the rollback.
   291  		// We create a new background context to avoid a partial rollback, which
   292  		// could leave the cluster in a worse state than when we started.
   293  		undoCtx, undoCancel := context.WithTimeout(context.Background(), topo.RemoteOperationTimeout)
   294  		defer undoCancel()
   295  
   296  		if undoErr := pr.tmc.UndoDemotePrimary(undoCtx, currentPrimary.Tablet, SemiSyncAckers(opts.durability, currentPrimary.Tablet) > 0); undoErr != nil {
   297  			pr.logger.Warningf("encountered error while performing UndoDemotePrimary(%v): %v", currentPrimary.AliasString(), undoErr)
   298  			finalWaitErr = vterrors.Wrapf(finalWaitErr, "encountered error while performing UndoDemotePrimary(%v): %v", currentPrimary.AliasString(), undoErr)
   299  		}
   300  
   301  		return "", finalWaitErr
   302  	}
   303  
   304  	// Primary-elect is caught up to the current primary. We can do the
   305  	// promotion now.
   306  	promoteCtx, promoteCancel := context.WithTimeout(ctx, opts.WaitReplicasTimeout)
   307  	defer promoteCancel()
   308  
   309  	rp, err := pr.tmc.PromoteReplica(promoteCtx, primaryElect, SemiSyncAckers(opts.durability, primaryElect) > 0)
   310  	if err != nil {
   311  		return "", vterrors.Wrapf(err, "primary-elect tablet %v failed to be promoted to primary; please try again", primaryElectAliasStr)
   312  	}
   313  
   314  	if ctx.Err() == context.DeadlineExceeded {
   315  		// PromoteReplica succeeded, but we ran out of time. PRS needs to be
   316  		// re-run to complete fully.
   317  		return "", vterrors.Errorf(vtrpc.Code_DEADLINE_EXCEEDED, "PlannedReparent timed out after successfully promoting primary-elect %v; please re-run to fix up the replicas", primaryElectAliasStr)
   318  	}
   319  
   320  	return rp, nil
   321  }
   322  
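        // performInitialPromotion handles the case where the shard has never had a
        // primary: it calls InitPrimary on the primary-elect and returns the resulting
        // position to use for the reparent journal.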
   323  func (pr *PlannedReparenter) performInitialPromotion(
   324  	ctx context.Context,
   325  	primaryElect *topodatapb.Tablet,
   326  	opts PlannedReparentOptions,
   327  ) (string, error) {
   328  	primaryElectAliasStr := topoproto.TabletAliasString(primaryElect.Alias)
   329  	promoteCtx, promoteCancel := context.WithTimeout(ctx, opts.WaitReplicasTimeout)
   330  	defer promoteCancel()
   331  
   332  	// During the initialization phase we have to use InitPrimary instead of PromoteReplica
   333  	// This is because the two operations while being largely similar have a very subtle difference
   334  	// InitPrimary first sets the MySQL instance to read-write and creates the database (if it does not exist)
   335  	// before it fixes the semi sync.
   336  	// PromoteReplica on the other hand, first fixes semi-sync before setting the MySQL instance to read-write.
   337  	// This is done to guarantee safety, in the sense that the semi-sync is on before we start accepting writes.
   338  	// However, during initialization, it is likely that the database would not be created in the MySQL instance.
   339  	// Therefore, we have to first set read-write mode, create the database and then fix semi-sync, otherwise we get blocked.
   340  	rp, err := pr.tmc.InitPrimary(promoteCtx, primaryElect, SemiSyncAckers(opts.durability, primaryElect) > 0)
   341  	if err != nil {
   342  		return "", vterrors.Wrapf(err, "primary-elect tablet %v failed to be promoted to primary; please try again", primaryElectAliasStr)
   343  	}
   344  
   345  	if ctx.Err() == context.DeadlineExceeded {
   346  		// InitPrimary succeeded, but we ran out of time. PRS needs to be
   347  		// re-run to complete fully.
   348  		return "", vterrors.Errorf(vtrpc.Code_DEADLINE_EXCEEDED, "PlannedReparent timed out after successfully promoting primary-elect %v; please re-run to fix up the replicas", primaryElectAliasStr)
   349  	}
   350  
   351  	return rp, nil
   352  }
   353  
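        // performPartialPromotionRecovery handles the case where the primary-elect is
        // already the current primary, typically when a previous PRS attempt only
        // partially completed. It ensures the tablet is read-write and returns its
        // current position so the replicas can be fixed up.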
   354  func (pr *PlannedReparenter) performPartialPromotionRecovery(ctx context.Context, primaryElect *topodatapb.Tablet) (string, error) {
   355  	// It's possible that a previous attempt to reparent failed to SetReadWrite,
   356  	// so call it here to make sure the underlying MySQL is read-write on the
   357  	// candidate primary.
   358  	setReadWriteCtx, setReadWriteCancel := context.WithTimeout(ctx, topo.RemoteOperationTimeout)
   359  	defer setReadWriteCancel()
   360  
   361  	if err := pr.tmc.SetReadWrite(setReadWriteCtx, primaryElect); err != nil {
   362  		return "", vterrors.Wrapf(err, "failed to SetReadWrite on current primary %v", topoproto.TabletAliasString(primaryElect.Alias))
   363  	}
   364  
   365  	// The primary is already the one we want according to its tablet record.
   366  	refreshCtx, refreshCancel := context.WithTimeout(ctx, topo.RemoteOperationTimeout)
   367  	defer refreshCancel()
   368  
   369  	// Get the replication position so we can try to fix the replicas (back in
   370  	// reparentShardLocked())
   371  	reparentJournalPosition, err := pr.tmc.PrimaryPosition(refreshCtx, primaryElect)
   372  	if err != nil {
   373  		return "", vterrors.Wrapf(err, "failed to get replication position of current primary %v", topoproto.TabletAliasString(primaryElect.Alias))
   374  	}
   375  
   376  	return reparentJournalPosition, nil
   377  }
   378  
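        // performPotentialPromotion handles the case where there is no clear current
        // primary: it demotes every tablet in the shard, checks that the primary-elect
        // has a replication position at least as advanced as every other tablet, and
        // then promotes the primary-elect.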
   379  func (pr *PlannedReparenter) performPotentialPromotion(
   380  	ctx context.Context,
   381  	keyspace string,
   382  	shard string,
   383  	primaryElect *topodatapb.Tablet,
   384  	tabletMap map[string]*topo.TabletInfo,
   385  	opts PlannedReparentOptions,
   386  ) (string, error) {
   387  	primaryElectAliasStr := topoproto.TabletAliasString(primaryElect.Alias)
   388  
   389  	pr.logger.Infof("no clear winner found for current primary term; checking if it's safe to recover by electing %v", primaryElectAliasStr)
   390  
   391  	type tabletPos struct {
   392  		alias  string
   393  		tablet *topodatapb.Tablet
   394  		pos    mysql.Position
   395  	}
   396  
   397  	positions := make(chan tabletPos, len(tabletMap))
   398  
   399  	// First, stop the world, to ensure no writes are happening anywhere. We
   400  	// don't trust that we know which tablets might be acting as primaries, so
   401  	// we simply demote everyone.
   402  	//
   403  	// Unlike the normal, single-primary case, we don't try to undo this if we
   404  	// fail. If we've made it here, it means there is no clear primary, so we
   405  	// don't know who it's safe to roll back to. Leaving everything read-only is
   406  	// probably safer, or at least no worse, than whatever weird state we were
   407  	// in before.
   408  	//
   409  	// If any tablets are unreachable, we can't be sure it's safe either,
   410  	// because one of the unreachable tablets might have a replication position
   411  	// further ahead than the candidate primary.
   412  
   413  	var (
   414  		stopAllWg sync.WaitGroup
   415  		rec       concurrency.AllErrorRecorder
   416  	)
   417  
   418  	stopAllCtx, stopAllCancel := context.WithTimeout(ctx, topo.RemoteOperationTimeout)
   419  	defer stopAllCancel()
   420  
   421  	for alias, tabletInfo := range tabletMap {
   422  		stopAllWg.Add(1)
   423  
   424  		go func(alias string, tablet *topodatapb.Tablet) {
   425  			defer stopAllWg.Done()
   426  
   427  			// Regardless of what type this tablet thinks it is, we will always
   428  			// call DemotePrimary to ensure the underlying MySQL server is in
   429  			// read-only, and to check its replication position. DemotePrimary is
   430  			// idempotent, so it's fine to call it on a replica (or other
   431  			// tablet type) that's already in read-only.
   432  			pr.logger.Infof("demoting tablet %v", alias)
   433  
   434  			primaryStatus, err := pr.tmc.DemotePrimary(stopAllCtx, tablet)
   435  			if err != nil {
   436  				rec.RecordError(vterrors.Wrapf(err, "DemotePrimary(%v) failed on contested primary", alias))
   437  
   438  				return
   439  			}
   440  
   441  			pos, err := mysql.DecodePosition(primaryStatus.Position)
   442  			if err != nil {
   443  				rec.RecordError(vterrors.Wrapf(err, "cannot decode replication position (%v) for demoted tablet %v", primaryStatus.Position, alias))
   444  
   445  				return
   446  			}
   447  
   448  			positions <- tabletPos{
   449  				alias:  alias,
   450  				tablet: tablet,
   451  				pos:    pos,
   452  			}
   453  		}(alias, tabletInfo.Tablet)
   454  	}
   455  
   456  	stopAllWg.Wait()
   457  	close(positions)
   458  
   459  	if rec.HasErrors() {
   460  		return "", vterrors.Wrap(rec.Error(), "failed to demote all tablets")
   461  	}
   462  
   463  	// Construct a mapping of alias to tablet position.
   464  	tabletPosMap := make(map[string]tabletPos, len(tabletMap))
   465  	for tp := range positions {
   466  		tabletPosMap[tp.alias] = tp
   467  	}
   468  
   469  	// Make sure no tablet has a more advanced position than the candidate
   470  	// primary. It's up to the caller to choose a suitable candidate, and to
   471  	// choose another if this check fails.
   472  	//
   473  	// Note that we still allow replication to run during this time, but we
   474  	// assume that no new high water mark can appear because we just demoted all
   475  	// tablets to read-only, so there should be no new transactions.
   476  	//
   477  	// TODO: consider temporarily replicating from another tablet to catch up,
   478  	// if the candidate primary is behind that tablet.
   479  	tp, ok := tabletPosMap[primaryElectAliasStr]
   480  	if !ok {
   481  		return "", vterrors.Errorf(vtrpc.Code_FAILED_PRECONDITION, "primary-elect tablet %v not found in tablet map", primaryElectAliasStr)
   482  	}
   483  
   484  	primaryElectPos := tp.pos
   485  
   486  	for _, tp := range tabletPosMap {
   487  		// The primary-elect pos has to be at least as advanced as every tablet
   488  		// in the shard.
   489  		if !primaryElectPos.AtLeast(tp.pos) {
   490  			return "", vterrors.Errorf(
   491  				vtrpc.Code_FAILED_PRECONDITION,
   492  				"tablet %v (position: %v) contains transactions not found in primary-elect %v (position: %v)",
   493  				tp.alias, tp.pos, primaryElectAliasStr, primaryElectPos,
   494  			)
   495  		}
   496  	}
   497  
   498  	// Check that we still have the topology lock.
   499  	if err := topo.CheckShardLocked(ctx, keyspace, shard); err != nil {
   500  		return "", vterrors.Wrap(err, "lost topology lock; aborting")
   501  	}
   502  
   503  	// Promote the candidate primary to type:PRIMARY.
   504  	promoteCtx, promoteCancel := context.WithTimeout(ctx, topo.RemoteOperationTimeout)
   505  	defer promoteCancel()
   506  
   507  	rp, err := pr.tmc.PromoteReplica(promoteCtx, primaryElect, SemiSyncAckers(opts.durability, primaryElect) > 0)
   508  	if err != nil {
   509  		return "", vterrors.Wrapf(err, "failed to promote %v to primary", primaryElectAliasStr)
   510  	}
   511  
   512  	return rp, nil
   513  }
   514  
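        // reparentShardLocked performs the actual reparent work and must be called
        // with the shard lock held. It loads the shard and durability information,
        // runs the preflight checks, performs whichever promotion matches the shard's
        // current state, and finally reparents all remaining tablets to the new
        // primary.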
   515  func (pr *PlannedReparenter) reparentShardLocked(
   516  	ctx context.Context,
   517  	ev *events.Reparent,
   518  	keyspace string,
   519  	shard string,
   520  	opts PlannedReparentOptions,
   521  ) error {
   522  	shardInfo, err := pr.ts.GetShard(ctx, keyspace, shard)
   523  	if err != nil {
   524  		return err
   525  	}
   526  
   527  	keyspaceDurability, err := pr.ts.GetKeyspaceDurability(ctx, keyspace)
   528  	if err != nil {
   529  		return err
   530  	}
   531  
   532  	pr.logger.Infof("Getting a new durability policy for %v", keyspaceDurability)
   533  	opts.durability, err = GetDurabilityPolicy(keyspaceDurability)
   534  	if err != nil {
   535  		return err
   536  	}
   537  
   538  	ev.ShardInfo = *shardInfo
   539  
   540  	event.DispatchUpdate(ev, "reading tablet map")
   541  
   542  	tabletMap, err := pr.ts.GetTabletMapForShard(ctx, keyspace, shard)
   543  	if err != nil {
   544  		return err
   545  	}
   546  
   547  	// Check invariants that PlannedReparentShard depends on.
   548  	if isNoop, err := pr.preflightChecks(ctx, ev, keyspace, shard, tabletMap, &opts); err != nil {
   549  		return err
   550  	} else if isNoop {
   551  		return nil
   552  	}
   553  
   554  	currentPrimary := FindCurrentPrimary(tabletMap, pr.logger)
   555  	reparentJournalPos := ""
   556  	// needsRefresh is used to keep track of whether we need to refresh the state
   557  	// of the new primary tablet. The only case in which we need to reload the state
   558  	// is when we are initializing the new primary. The reason is that the first
   559  	// time we try to set up all the components, like vreplication.Engine, they fail
   560  	// since the database isn't created until we setServing.
   561  	// A call to RefreshState fixes all the components. This isn't strictly necessary
   562  	// in the sense that all the components will retry initialization anyway after some
   563  	// time, so even without a call to RefreshState, they all converge correctly.
   564  	needsRefresh := false
   565  
   566  	// Depending on whether we can find a current primary, and what the caller
   567  	// specified as the candidate primary, we will do one of four kinds of
   568  	// promotions:
   569  	// 1) There is no current primary and the shard info also does not have
   570  	// anything stored. This happens when none of the tablets have ever been promoted.
   571  	// So we can promote the primary-elect without any issues. After that, all we need
   572  	// to do is reparent all the tablets to that primary, which is accomplished in the
   573  	// common code path.
   574  	//
   575  	// 2) There is no clear current primary. In this case we will try to
   576  	// determine if it's safe to promote the candidate specified by the caller.
   577  	// If it's not -- including if any tablet in the shard is unreachable -- we
   578  	// bail. We also don't attempt to rollback a failed demotion in this case.
   579  	//
   580  	// 3) The current primary is the same as the candidate primary specified by
   581  	// the caller. In this case, we assume there was a previous PRS for this
   582  	// primary, and the caller is re-issuing the call to fix-up any replicas. We
   583  	// also idempotently set the desired primary as read-write, just in case.
   584  	//
   585  	// 4) The current primary and the desired primary differ. In this case, we
   586  	// perform a graceful promotion, in which we validate the desired primary is
   587  	// sufficiently up-to-date, demote the current primary, wait for the desired
   588  	// primary to catch up to that position, and set the desired primary
   589  	// read-write. We will attempt to rollback a failed demotion in this case,
   590  	// unlike in case (2), because we have a known good state to roll back to.
   591  	//
   592  	// In all cases, we will retrieve the reparent journal position that was
   593  	// inserted in the new primary's journal, so we can use it below to check
   594  	// that all the replicas have attached to the new primary successfully.
   595  	switch {
   596  	case currentPrimary == nil && ev.ShardInfo.PrimaryAlias == nil:
   597  		// Case (1): no primary has ever been elected. Initialize
   598  		// the primary-elect tablet.
   599  		reparentJournalPos, err = pr.performInitialPromotion(ctx, ev.NewPrimary, opts)
   600  		needsRefresh = true
   601  	case currentPrimary == nil && ev.ShardInfo.PrimaryAlias != nil:
   602  		// Case (2): no clear current primary. Try to find a safe promotion
   603  		// candidate, and promote to it.
   604  		reparentJournalPos, err = pr.performPotentialPromotion(ctx, keyspace, shard, ev.NewPrimary, tabletMap, opts)
   605  	case topoproto.TabletAliasEqual(currentPrimary.Alias, opts.NewPrimaryAlias):
   606  		// Case (3): desired new primary is the current primary. Attempt to fix
   607  		// up replicas to recover from a previous partial promotion.
   608  		reparentJournalPos, err = pr.performPartialPromotionRecovery(ctx, ev.NewPrimary)
   609  	default:
   610  		// Case (4): desired primary and current primary differ. Do a graceful
   611  		// demotion-then-promotion.
   612  		reparentJournalPos, err = pr.performGracefulPromotion(ctx, ev, keyspace, shard, currentPrimary, ev.NewPrimary, tabletMap, opts)
   613  	}
   614  
   615  	if err != nil {
   616  		return err
   617  	}
   618  
   619  	if err := topo.CheckShardLocked(ctx, keyspace, shard); err != nil {
   620  		return vterrors.Wrap(err, "lost topology lock, aborting")
   621  	}
   622  
   623  	if err := pr.reparentTablets(ctx, ev, reparentJournalPos, tabletMap, opts); err != nil {
   624  		return err
   625  	}
   626  
   627  	if needsRefresh {
   628  		// Refresh the state to force the tabletserver to reconnect after db has been created.
   629  		if err := pr.tmc.RefreshState(ctx, ev.NewPrimary); err != nil {
   630  			pr.logger.Warningf("RefreshState failed: %v", err)
   631  		}
   632  	}
   633  	return nil
   634  }
   635  
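        // reparentTablets points every tablet other than the new primary at the new
        // primary, and populates the reparent journal on the new primary so that
        // successful replication of the journal entry proves each replica is attached
        // to it.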
   636  func (pr *PlannedReparenter) reparentTablets(
   637  	ctx context.Context,
   638  	ev *events.Reparent,
   639  	reparentJournalPosition string,
   640  	tabletMap map[string]*topo.TabletInfo,
   641  	opts PlannedReparentOptions,
   642  ) error {
   643  	// Create a cancellable context for the entire set of reparent operations.
   644  	// If any error conditions happen, we can cancel all outgoing RPCs.
   645  	replCtx, replCancel := context.WithTimeout(ctx, opts.WaitReplicasTimeout)
   646  	defer replCancel()
   647  
   648  	// Go through all the tablets.
   649  	// - New primary: populate the reparent journal.
   650  	// - Everybody else: reparent to the new primary; wait for the reparent
   651  	//	 journal row.
   652  	event.DispatchUpdate(ev, "reparenting all tablets")
   653  
   654  	// We add a (hopefully) unique record to the reparent journal table on the
   655  	// new primary, so we can check if replicas got it through replication.
   656  	reparentJournalTimestamp := time.Now().UnixNano()
   657  	primaryElectAliasStr := topoproto.TabletAliasString(ev.NewPrimary.Alias)
   658  	replicasWg := sync.WaitGroup{}
   659  	rec := concurrency.AllErrorRecorder{}
   660  
   661  	// Point all replicas at the new primary and check that they receive the
   662  	// reparent journal entry, proving that they are replicating from the new
   663  	// primary. We do this concurrently with adding the journal entry (after
   664  	// this loop), because if semi-sync is enabled, the update to the journal
   665  	// table will block until at least one replica is successfully attached to
   666  	// the new primary.
   667  	for alias, tabletInfo := range tabletMap {
   668  		if alias == primaryElectAliasStr {
   669  			continue
   670  		}
   671  
   672  		replicasWg.Add(1)
   673  
   674  		go func(alias string, tablet *topodatapb.Tablet) {
   675  			defer replicasWg.Done()
   676  			pr.logger.Infof("setting new primary on replica %v", alias)
   677  
   678  			// Note: we used to force replication to start on the old primary,
   679  			// but now that we support "resuming" a previously-failed PRS
   680  			// attempt, we can no longer assume that we know who the former
   681  			// primary was. Instead, we rely on the former primary to remember
   682  			// that it needs to start replication after transitioning from
   683  			// PRIMARY => REPLICA.
   684  			forceStartReplication := false
   685  			if err := pr.tmc.SetReplicationSource(replCtx, tablet, ev.NewPrimary.Alias, reparentJournalTimestamp, "", forceStartReplication, IsReplicaSemiSync(opts.durability, ev.NewPrimary, tablet)); err != nil {
   686  				rec.RecordError(vterrors.Wrapf(err, "tablet %v failed to SetReplicationSource(%v): %v", alias, primaryElectAliasStr, err))
   687  			}
   688  		}(alias, tabletInfo.Tablet)
   689  	}
   690  
   691  	// Add a reparent journal entry on the new primary. If semi-sync is enabled,
   692  	// this blocks until at least one replica is reparented (above) and
   693  	// successfully replicating from the new primary.
   694  	//
   695  	// If we fail to populate the reparent journal, there's no way the replicas
   696  	// will work, so we cancel the ongoing reparent RPCs and bail out.
   697  	pr.logger.Infof("populating reparent journal on new primary %v", primaryElectAliasStr)
   698  	if err := pr.tmc.PopulateReparentJournal(replCtx, ev.NewPrimary, reparentJournalTimestamp, "PlannedReparentShard", ev.NewPrimary.Alias, reparentJournalPosition); err != nil {
   699  		pr.logger.Warningf("primary failed to PopulateReparentJournal (position: %v); cancelling replica reparent attempts", reparentJournalPosition)
   700  		replCancel()
   701  		replicasWg.Wait()
   702  
   703  		return vterrors.Wrapf(err, "failed PopulateReparentJournal(primary=%v, ts=%v, pos=%v): %v", primaryElectAliasStr, reparentJournalTimestamp, reparentJournalPosition, err)
   704  	}
   705  
   706  	// Reparent journal has been populated on the new primary. We just need to
   707  	// wait for all the replicas to receive it.
   708  	replicasWg.Wait()
   709  
   710  	if err := rec.Error(); err != nil {
   711  		msg := "some replicas failed to reparent; retry PlannedReparentShard with the same new primary alias (%v) to retry failed replicas"
   712  		pr.logger.Errorf2(err, msg, primaryElectAliasStr)
   713  		return vterrors.Wrapf(err, msg, primaryElectAliasStr)
   714  	}
   715  
   716  	return nil
   717  }