vitess.io/vitess@v0.16.2/go/vt/vtctl/reparentutil/planned_reparenter.go (about)

/*
Copyright 2021 The Vitess Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package reparentutil

import (
    "context"
    "fmt"
    "sync"
    "time"

    "google.golang.org/protobuf/proto"

    "vitess.io/vitess/go/event"
    "vitess.io/vitess/go/mysql"
    "vitess.io/vitess/go/vt/concurrency"
    "vitess.io/vitess/go/vt/logutil"
    "vitess.io/vitess/go/vt/topo"
    "vitess.io/vitess/go/vt/topo/topoproto"
    "vitess.io/vitess/go/vt/topotools/events"
    "vitess.io/vitess/go/vt/vterrors"
    "vitess.io/vitess/go/vt/vttablet/tmclient"

    logutilpb "vitess.io/vitess/go/vt/proto/logutil"
    topodatapb "vitess.io/vitess/go/vt/proto/topodata"
    "vitess.io/vitess/go/vt/proto/vtrpc"
)

// PlannedReparenter performs PlannedReparentShard operations.
type PlannedReparenter struct {
    ts     *topo.Server
    tmc    tmclient.TabletManagerClient
    logger logutil.Logger
}

// PlannedReparentOptions provides optional parameters to PlannedReparentShard
// operations. Options are passed by value, so it is safe for callers to mutate
// and reuse options structs for multiple calls.
type PlannedReparentOptions struct {
    NewPrimaryAlias     *topodatapb.TabletAlias
    AvoidPrimaryAlias   *topodatapb.TabletAlias
    WaitReplicasTimeout time.Duration

    // Private options managed internally. We use value-passing semantics to
    // set these options inside a PlannedReparent without leaking these details
    // back out to the caller.

    lockAction string
    durability Durabler
}

// NewPlannedReparenter returns a new PlannedReparenter object, ready to perform
// PlannedReparentShard operations using the given topo.Server,
// TabletManagerClient, and logger.
//
// Providing a nil logger instance is allowed.
func NewPlannedReparenter(ts *topo.Server, tmc tmclient.TabletManagerClient, logger logutil.Logger) *PlannedReparenter {
    pr := PlannedReparenter{
        ts:     ts,
        tmc:    tmc,
        logger: logger,
    }

    if pr.logger == nil {
        // Create a no-op logger so we can call functions on pr.logger without
        // needing to constantly check it for non-nil first.
        pr.logger = logutil.NewCallbackLogger(func(e *logutilpb.Event) {})
    }

    return &pr
}

// ReparentShard performs a PlannedReparentShard operation on the given keyspace
// and shard. It will make the provided tablet the primary for the shard, when
// both the current and desired primary are reachable and in a good state.
func (pr *PlannedReparenter) ReparentShard(ctx context.Context, keyspace string, shard string, opts PlannedReparentOptions) (*events.Reparent, error) {
    var err error
    if err = topo.CheckShardLocked(ctx, keyspace, shard); err != nil {
        // The shard was not locked by the caller, so take the shard lock here
        // for the duration of the operation.
        var unlock func(*error)
        opts.lockAction = pr.getLockAction(opts)
        ctx, unlock, err = pr.ts.LockShard(ctx, keyspace, shard, opts.lockAction)
        if err != nil {
            return nil, err
        }
        defer unlock(&err)
    }

    if opts.NewPrimaryAlias == nil && opts.AvoidPrimaryAlias == nil {
        shardInfo, err := pr.ts.GetShard(ctx, keyspace, shard)
        if err != nil {
            return nil, err
        }

        opts.AvoidPrimaryAlias = shardInfo.PrimaryAlias
    }

    ev := &events.Reparent{}
    defer func() {
        switch err {
        case nil:
            event.DispatchUpdate(ev, "finished PlannedReparentShard")
        default:
            event.DispatchUpdate(ev, "failed PlannedReparentShard: "+err.Error())
        }
    }()

    err = pr.reparentShardLocked(ctx, ev, keyspace, shard, opts)

    return ev, err
}

func (pr *PlannedReparenter) getLockAction(opts PlannedReparentOptions) string {
    return fmt.Sprintf(
        "PlannedReparentShard(%v, AvoidPrimary = %v)",
        topoproto.TabletAliasString(opts.NewPrimaryAlias),
        topoproto.TabletAliasString(opts.AvoidPrimaryAlias),
    )
}

// preflightChecks checks some invariants that pr.reparentShardLocked() depends
// on. It returns a boolean to indicate if the reparent is a no-op (which
// happens iff the caller specified an AvoidPrimaryAlias and it's not the shard
// primary), as well as an error.
//
// It will also set the NewPrimaryAlias option if the caller did not specify
// one, provided it can choose a new primary candidate. See ChooseNewPrimary()
// for details on primary candidate selection.
func (pr *PlannedReparenter) preflightChecks(
    ctx context.Context,
    ev *events.Reparent,
    keyspace string,
    shard string,
    tabletMap map[string]*topo.TabletInfo,
    opts *PlannedReparentOptions, // we take a pointer here to set NewPrimaryAlias
) (isNoop bool, err error) {
    // We don't want to fail when both NewPrimaryAlias and AvoidPrimaryAlias are nil.
    // But when they are both nil, we assign AvoidPrimaryAlias to be ShardInfo.PrimaryAlias.
    // In the case where we are using PRS to initialize the cluster without specifying the
    // NewPrimaryAlias, all three will be nil.
    if opts.NewPrimaryAlias != nil && topoproto.TabletAliasEqual(opts.NewPrimaryAlias, opts.AvoidPrimaryAlias) {
        return true, vterrors.Errorf(vtrpc.Code_FAILED_PRECONDITION, "primary-elect tablet %v is the same as the tablet to avoid", topoproto.TabletAliasString(opts.NewPrimaryAlias))
    }

    if opts.NewPrimaryAlias == nil {
        // We don't want to fail when both ShardInfo.PrimaryAlias and AvoidPrimaryAlias are nil.
        // This happens when we are using PRS to initialize the cluster without specifying the NewPrimaryAlias.
        if ev.ShardInfo.PrimaryAlias != nil && !topoproto.TabletAliasEqual(opts.AvoidPrimaryAlias, ev.ShardInfo.PrimaryAlias) {
            event.DispatchUpdate(ev, "current primary is different than tablet to avoid, nothing to do")
            return true, nil
        }

        event.DispatchUpdate(ev, "searching for primary candidate")

        opts.NewPrimaryAlias, err = ChooseNewPrimary(ctx, pr.tmc, &ev.ShardInfo, tabletMap, opts.AvoidPrimaryAlias, opts.WaitReplicasTimeout, opts.durability, pr.logger)
        if err != nil {
            return true, err
        }

        if opts.NewPrimaryAlias == nil {
            return true, vterrors.Errorf(vtrpc.Code_INTERNAL, "cannot find a tablet to reparent to in the same cell as the current primary")
        }

        pr.logger.Infof("elected new primary candidate %v", topoproto.TabletAliasString(opts.NewPrimaryAlias))
        event.DispatchUpdate(ev, "elected new primary candidate")
    }

    primaryElectAliasStr := topoproto.TabletAliasString(opts.NewPrimaryAlias)

    newPrimaryTabletInfo, ok := tabletMap[primaryElectAliasStr]
    if !ok {
        return true, vterrors.Errorf(vtrpc.Code_FAILED_PRECONDITION, "primary-elect tablet %v is not in the shard", primaryElectAliasStr)
    }

    // PRS is only meant to be called when all the tablets are healthy.
    // So we assume that all the tablets are reachable and check if the primary-elect will be able
    // to make progress if it is promoted. This is needed because sometimes users may ask to promote
    // a tablet which can never make progress. For example, let's say the user has a durability policy
    // where they require 2 semi-sync acks, but from cross-cell replicas.
    // Let's say they have 3 replicas: A in zone 1, and B and C in zone 2. In this case, A is the only
    // eligible primary-elect. Both B and C won't be able to make forward progress if they are promoted.
    var tabletsReachable []*topodatapb.Tablet
    for _, info := range tabletMap {
        tabletsReachable = append(tabletsReachable, info.Tablet)
    }
    if !canEstablishForTablet(opts.durability, newPrimaryTabletInfo.Tablet, tabletsReachable) {
        return true, vterrors.Errorf(vtrpc.Code_FAILED_PRECONDITION, "primary-elect tablet %v won't be able to make forward progress on promotion", primaryElectAliasStr)
    }

    ev.NewPrimary = proto.Clone(newPrimaryTabletInfo.Tablet).(*topodatapb.Tablet)

    return false, nil
}

func (pr *PlannedReparenter) performGracefulPromotion(
    ctx context.Context,
    ev *events.Reparent,
    keyspace string,
    shard string,
    currentPrimary *topo.TabletInfo,
    primaryElect *topodatapb.Tablet,
    tabletMap map[string]*topo.TabletInfo,
    opts PlannedReparentOptions,
) (string, error) {
    primaryElectAliasStr := topoproto.TabletAliasString(primaryElect.Alias)
    ev.OldPrimary = proto.Clone(currentPrimary.Tablet).(*topodatapb.Tablet)

    // Before demoting the old primary, we're going to ensure that replication
    // is working from the old primary to the primary-elect. If replication is
    // not working, a PlannedReparent is not safe to do, because the candidate
    // won't catch up and we'll potentially miss transactions.
    pr.logger.Infof("checking replication on primary-elect %v", primaryElectAliasStr)

    // First, we find the position of the current primary.
    // Note that this is just a snapshot of the position, since we let it keep
    // accepting writes until we're sure we want to proceed with the promotion.
    snapshotCtx, snapshotCancel := context.WithTimeout(ctx, topo.RemoteOperationTimeout)
    defer snapshotCancel()

    snapshotPos, err := pr.tmc.PrimaryPosition(snapshotCtx, currentPrimary.Tablet)
    if err != nil {
        return "", vterrors.Wrapf(err, "cannot get replication position on current primary %v; current primary must be healthy to perform PlannedReparent", currentPrimary.AliasString())
    }

    // Next, we wait for the primary-elect to catch up to that snapshot point.
    // If it can catch up within WaitReplicasTimeout, we can be fairly
    // confident that it will catch up on everything else that happens between
    // the snapshot point we grabbed above and when we demote the old primary
    // below.
    //
    // We do this as an idempotent SetReplicationSource to make sure the replica knows who
    // the current primary is.
    setSourceCtx, setSourceCancel := context.WithTimeout(ctx, opts.WaitReplicasTimeout)
    defer setSourceCancel()

    if err := pr.tmc.SetReplicationSource(setSourceCtx, primaryElect, currentPrimary.Alias, 0, snapshotPos, true, IsReplicaSemiSync(opts.durability, currentPrimary.Tablet, primaryElect)); err != nil {
        return "", vterrors.Wrapf(err, "replication on primary-elect %v did not catch up in time; replication must be healthy to perform PlannedReparent", primaryElectAliasStr)
    }

    // Verify we still have the topology lock before doing the demotion.
    if err := topo.CheckShardLocked(ctx, keyspace, shard); err != nil {
        return "", vterrors.Wrap(err, "lost topology lock; aborting")
    }

    // Next up, demote the current primary and get its replication position.
    // It's fine if the current primary was already demoted, since DemotePrimary
    // is idempotent.
    pr.logger.Infof("demoting current primary: %v", currentPrimary.AliasString())
    event.DispatchUpdate(ev, "demoting old primary")

    demoteCtx, demoteCancel := context.WithTimeout(ctx, topo.RemoteOperationTimeout)
    defer demoteCancel()

    primaryStatus, err := pr.tmc.DemotePrimary(demoteCtx, currentPrimary.Tablet)
    if err != nil {
        return "", vterrors.Wrapf(err, "failed to DemotePrimary on current primary %v: %v", currentPrimary.AliasString(), err)
    }

    // Wait for the primary-elect to catch up to the position we demoted the
    // current primary at. If it fails to catch up within WaitReplicasTimeout,
    // we will try to roll back to the original primary before aborting.
    waitCtx, waitCancel := context.WithTimeout(ctx, opts.WaitReplicasTimeout)
    defer waitCancel()

    waitErr := pr.tmc.WaitForPosition(waitCtx, primaryElect, primaryStatus.Position)

    // Do some wrapping of errors to get the right codes and callstacks.
    var finalWaitErr error
    switch {
    case waitErr != nil:
        finalWaitErr = vterrors.Wrapf(waitErr, "primary-elect tablet %v failed to catch up with replication %v", primaryElectAliasStr, primaryStatus.Position)
    case ctx.Err() == context.DeadlineExceeded:
        finalWaitErr = vterrors.New(vtrpc.Code_DEADLINE_EXCEEDED, "PlannedReparent timed out; please try again")
    }

    if finalWaitErr != nil {
        // It's possible that we've used up the calling context's timeout, or
        // that not enough time is left on it to finish the rollback.
        // We create a new background context to avoid a partial rollback, which
        // could leave the cluster in a worse state than when we started.
        undoCtx, undoCancel := context.WithTimeout(context.Background(), topo.RemoteOperationTimeout)
        defer undoCancel()

        if undoErr := pr.tmc.UndoDemotePrimary(undoCtx, currentPrimary.Tablet, SemiSyncAckers(opts.durability, currentPrimary.Tablet) > 0); undoErr != nil {
            pr.logger.Warningf("encountered error while performing UndoDemotePrimary(%v): %v", currentPrimary.AliasString(), undoErr)
            finalWaitErr = vterrors.Wrapf(finalWaitErr, "encountered error while performing UndoDemotePrimary(%v): %v", currentPrimary.AliasString(), undoErr)
        }

        return "", finalWaitErr
    }

    // Primary-elect is caught up to the current primary. We can do the
    // promotion now.
    promoteCtx, promoteCancel := context.WithTimeout(ctx, opts.WaitReplicasTimeout)
    defer promoteCancel()

    rp, err := pr.tmc.PromoteReplica(promoteCtx, primaryElect, SemiSyncAckers(opts.durability, primaryElect) > 0)
    if err != nil {
        return "", vterrors.Wrapf(err, "primary-elect tablet %v failed to be promoted to primary; please try again", primaryElectAliasStr)
    }

    if ctx.Err() == context.DeadlineExceeded {
        // PromoteReplica succeeded, but we ran out of time. PRS needs to be
        // re-run to complete fully.
        return "", vterrors.Errorf(vtrpc.Code_DEADLINE_EXCEEDED, "PlannedReparent timed out after successfully promoting primary-elect %v; please re-run to fix up the replicas", primaryElectAliasStr)
    }

    return rp, nil
}

func (pr *PlannedReparenter) performInitialPromotion(
    ctx context.Context,
    primaryElect *topodatapb.Tablet,
    opts PlannedReparentOptions,
) (string, error) {
    primaryElectAliasStr := topoproto.TabletAliasString(primaryElect.Alias)
    promoteCtx, promoteCancel := context.WithTimeout(ctx, opts.WaitReplicasTimeout)
    defer promoteCancel()

    // During the initialization phase we have to use InitPrimary instead of PromoteReplica.
    // This is because the two operations, while largely similar, have a very subtle difference:
    // InitPrimary first sets the MySQL instance to read-write and creates the database (if it does not exist)
    // before it fixes the semi-sync.
    // PromoteReplica, on the other hand, first fixes semi-sync before setting the MySQL instance to read-write.
    // This is done to guarantee safety, in the sense that semi-sync is on before we start accepting writes.
    // However, during initialization, it is likely that the database has not yet been created in the MySQL instance.
    // Therefore, we have to first set read-write mode, create the database and then fix semi-sync, otherwise we get blocked.
    rp, err := pr.tmc.InitPrimary(promoteCtx, primaryElect, SemiSyncAckers(opts.durability, primaryElect) > 0)
    if err != nil {
        return "", vterrors.Wrapf(err, "primary-elect tablet %v failed to be promoted to primary; please try again", primaryElectAliasStr)
    }

    if ctx.Err() == context.DeadlineExceeded {
        // InitPrimary succeeded, but we ran out of time. PRS needs to be
        // re-run to complete fully.
348 return "", vterrors.Errorf(vtrpc.Code_DEADLINE_EXCEEDED, "PLannedReparent timed out after successfully promoting primary-elect %v; please re-run to fix up the replicas", primaryElectAliasStr) 349 } 350 351 return rp, nil 352 } 353 354 func (pr *PlannedReparenter) performPartialPromotionRecovery(ctx context.Context, primaryElect *topodatapb.Tablet) (string, error) { 355 // It's possible that a previous attempt to reparent failed to SetReadWrite, 356 // so call it here to make sure the underlying MySQL is read-write on the 357 // candidate primary. 358 setReadWriteCtx, setReadWriteCancel := context.WithTimeout(ctx, topo.RemoteOperationTimeout) 359 defer setReadWriteCancel() 360 361 if err := pr.tmc.SetReadWrite(setReadWriteCtx, primaryElect); err != nil { 362 return "", vterrors.Wrapf(err, "failed to SetReadWrite on current primary %v", topoproto.TabletAliasString(primaryElect.Alias)) 363 } 364 365 // The primary is already the one we want according to its tablet record. 366 refreshCtx, refreshCancel := context.WithTimeout(ctx, topo.RemoteOperationTimeout) 367 defer refreshCancel() 368 369 // Get the replication position so we can try to fix the replicas (back in 370 // reparentShardLocked()) 371 reparentJournalPosition, err := pr.tmc.PrimaryPosition(refreshCtx, primaryElect) 372 if err != nil { 373 return "", vterrors.Wrapf(err, "failed to get replication position of current primary %v", topoproto.TabletAliasString(primaryElect.Alias)) 374 } 375 376 return reparentJournalPosition, nil 377 } 378 379 func (pr *PlannedReparenter) performPotentialPromotion( 380 ctx context.Context, 381 keyspace string, 382 shard string, 383 primaryElect *topodatapb.Tablet, 384 tabletMap map[string]*topo.TabletInfo, 385 opts PlannedReparentOptions, 386 ) (string, error) { 387 primaryElectAliasStr := topoproto.TabletAliasString(primaryElect.Alias) 388 389 pr.logger.Infof("no clear winner found for current primary term; checking if it's safe to recover by electing %v", primaryElectAliasStr) 390 391 type tabletPos struct { 392 alias string 393 tablet *topodatapb.Tablet 394 pos mysql.Position 395 } 396 397 positions := make(chan tabletPos, len(tabletMap)) 398 399 // First, stop the world, to ensure no writes are happening anywhere. We 400 // don't trust that we know which tablets might be acting as primaries, so 401 // we simply demote everyone. 402 // 403 // Unlike the normal, single-primary case, we don't try to undo this if we 404 // fail. If we've made it here, it means there is no clear primary, so we 405 // don't know who it's safe to roll back to. Leaving everything read-only is 406 // probably safer, or at least no worse, than whatever weird state we were 407 // in before. 408 // 409 // If any tablets are unreachable, we can't be sure it's safe either, 410 // because one of the unreachable tablets might have a replication position 411 // further ahead than the candidate primary. 412 413 var ( 414 stopAllWg sync.WaitGroup 415 rec concurrency.AllErrorRecorder 416 ) 417 418 stopAllCtx, stopAllCancel := context.WithTimeout(ctx, topo.RemoteOperationTimeout) 419 defer stopAllCancel() 420 421 for alias, tabletInfo := range tabletMap { 422 stopAllWg.Add(1) 423 424 go func(alias string, tablet *topodatapb.Tablet) { 425 defer stopAllWg.Done() 426 427 // Regardless of what type this tablet thinks it is, we will always 428 // call DemotePrimary to ensure the underlying MySQL server is in 429 // read-only, and to check its replication position. 
            // DemotePrimary is idempotent, so it's fine to call it on a replica
            // (or other tablet type) that's already in read-only.
            pr.logger.Infof("demoting tablet %v", alias)

            primaryStatus, err := pr.tmc.DemotePrimary(stopAllCtx, tablet)
            if err != nil {
                rec.RecordError(vterrors.Wrapf(err, "DemotePrimary(%v) failed on contested primary", alias))

                return
            }

            pos, err := mysql.DecodePosition(primaryStatus.Position)
            if err != nil {
                rec.RecordError(vterrors.Wrapf(err, "cannot decode replication position (%v) for demoted tablet %v", primaryStatus.Position, alias))

                return
            }

            positions <- tabletPos{
                alias:  alias,
                tablet: tablet,
                pos:    pos,
            }
        }(alias, tabletInfo.Tablet)
    }

    stopAllWg.Wait()
    close(positions)

    if rec.HasErrors() {
        return "", vterrors.Wrap(rec.Error(), "failed to demote all tablets")
    }

    // Construct a mapping of alias to tablet position.
    tabletPosMap := make(map[string]tabletPos, len(tabletMap))
    for tp := range positions {
        tabletPosMap[tp.alias] = tp
    }

    // Make sure no tablet has a more advanced position than the candidate
    // primary. It's up to the caller to choose a suitable candidate, and to
    // choose another if this check fails.
    //
    // Note that we still allow replication to run during this time, but we
    // assume that no new high water mark can appear because we just demoted all
    // tablets to read-only, so there should be no new transactions.
    //
    // TODO: consider temporarily replicating from another tablet to catch up,
    // if the candidate primary is behind that tablet.
    tp, ok := tabletPosMap[primaryElectAliasStr]
    if !ok {
        return "", vterrors.Errorf(vtrpc.Code_FAILED_PRECONDITION, "primary-elect tablet %v not found in tablet map", primaryElectAliasStr)
    }

    primaryElectPos := tp.pos

    for _, tp := range tabletPosMap {
        // The primary-elect pos has to be at least as advanced as every tablet
        // in the shard.
        if !primaryElectPos.AtLeast(tp.pos) {
            return "", vterrors.Errorf(
                vtrpc.Code_FAILED_PRECONDITION,
                "tablet %v (position: %v) contains transactions not found in primary-elect %v (position: %v)",
                tp.alias, tp.pos, primaryElectAliasStr, primaryElectPos,
            )
        }
    }

    // Check that we still have the topology lock.
    if err := topo.CheckShardLocked(ctx, keyspace, shard); err != nil {
        return "", vterrors.Wrap(err, "lost topology lock; aborting")
    }

    // Promote the candidate primary to type:PRIMARY.
    promoteCtx, promoteCancel := context.WithTimeout(ctx, topo.RemoteOperationTimeout)
    defer promoteCancel()

    rp, err := pr.tmc.PromoteReplica(promoteCtx, primaryElect, SemiSyncAckers(opts.durability, primaryElect) > 0)
    if err != nil {
        return "", vterrors.Wrapf(err, "failed to promote %v to primary", primaryElectAliasStr)
    }

    return rp, nil
}

func (pr *PlannedReparenter) reparentShardLocked(
    ctx context.Context,
    ev *events.Reparent,
    keyspace string,
    shard string,
    opts PlannedReparentOptions,
) error {
    shardInfo, err := pr.ts.GetShard(ctx, keyspace, shard)
    if err != nil {
        return err
    }

    keyspaceDurability, err := pr.ts.GetKeyspaceDurability(ctx, keyspace)
    if err != nil {
        return err
    }

    pr.logger.Infof("Getting a new durability policy for %v", keyspaceDurability)
    opts.durability, err = GetDurabilityPolicy(keyspaceDurability)
    if err != nil {
        return err
    }

    ev.ShardInfo = *shardInfo

    event.DispatchUpdate(ev, "reading tablet map")

    tabletMap, err := pr.ts.GetTabletMapForShard(ctx, keyspace, shard)
    if err != nil {
        return err
    }

    // Check invariants that PlannedReparentShard depends on.
    if isNoop, err := pr.preflightChecks(ctx, ev, keyspace, shard, tabletMap, &opts); err != nil {
        return err
    } else if isNoop {
        return nil
    }

    currentPrimary := FindCurrentPrimary(tabletMap, pr.logger)
    reparentJournalPos := ""
    // needsRefresh is used to keep track of whether we need to refresh the state
    // of the new primary tablet. The only case where we need to reload the state
    // is when we are initializing the new primary. The reason is that the first
    // time we try to set up all the components like vreplication.Engine, they fail
    // since the database isn't created until we setServing.
    // A call to RefreshState fixes all the components. This isn't strictly necessary,
    // in the sense that all the components will retry initialization anyway after some
    // time, so even without a call to RefreshState, they all converge correctly.
    needsRefresh := false

    // Depending on whether we can find a current primary, and what the caller
    // specified as the candidate primary, we will do one of four kinds of
    // promotions:
    // 1) There is no current primary and the shard info also does not have
    // anything stored. This happens when none of the tablets have ever been promoted.
    // So we can promote the primary-elect without any issues. After that, all we need
    // to do is to reparent all the tablets to that primary, which is accomplished in the
    // common code path.
    //
    // 2) There is no clear current primary. In this case we will try to
    // determine if it's safe to promote the candidate specified by the caller.
    // If it's not -- including if any tablet in the shard is unreachable -- we
    // bail. We also don't attempt to rollback a failed demotion in this case.
    //
    // 3) The current primary is the same as the candidate primary specified by
    // the caller. In this case, we assume there was a previous PRS for this
    // primary, and the caller is re-issuing the call to fix up any replicas. We
    // also idempotently set the desired primary as read-write, just in case.
    //
    // 4) The current primary and the desired primary differ.
    // In this case, we perform a graceful promotion, in which we validate the
    // desired primary is sufficiently up-to-date, demote the current primary,
    // wait for the desired primary to catch up to that position, and set the
    // desired primary read-write. We will attempt to rollback a failed demotion
    // in this case, unlike in case (2), because we have a known good state to
    // rollback to.
    //
    // In all cases, we will retrieve the reparent journal position that was
    // inserted in the new primary's journal, so we can use it below to check
    // that all the replicas have attached to the new primary successfully.
    switch {
    case currentPrimary == nil && ev.ShardInfo.PrimaryAlias == nil:
        // Case (1): no primary has been elected ever. Initialize
        // the primary-elect tablet.
        reparentJournalPos, err = pr.performInitialPromotion(ctx, ev.NewPrimary, opts)
        needsRefresh = true
    case currentPrimary == nil && ev.ShardInfo.PrimaryAlias != nil:
        // Case (2): no clear current primary. Try to find a safe promotion
        // candidate, and promote to it.
        reparentJournalPos, err = pr.performPotentialPromotion(ctx, keyspace, shard, ev.NewPrimary, tabletMap, opts)
    case topoproto.TabletAliasEqual(currentPrimary.Alias, opts.NewPrimaryAlias):
        // Case (3): desired new primary is the current primary. Attempt to fix
        // up replicas to recover from a previous partial promotion.
        reparentJournalPos, err = pr.performPartialPromotionRecovery(ctx, ev.NewPrimary)
    default:
        // Case (4): desired primary and current primary differ. Do a graceful
        // demotion-then-promotion.
        reparentJournalPos, err = pr.performGracefulPromotion(ctx, ev, keyspace, shard, currentPrimary, ev.NewPrimary, tabletMap, opts)
    }

    if err != nil {
        return err
    }

    if err := topo.CheckShardLocked(ctx, keyspace, shard); err != nil {
        return vterrors.Wrap(err, "lost topology lock, aborting")
    }

    if err := pr.reparentTablets(ctx, ev, reparentJournalPos, tabletMap, opts); err != nil {
        return err
    }

    if needsRefresh {
        // Refresh the state to force the tabletserver to reconnect after the db has been created.
        if err := pr.tmc.RefreshState(ctx, ev.NewPrimary); err != nil {
            pr.logger.Warningf("RefreshState failed: %v", err)
        }
    }
    return nil
}

func (pr *PlannedReparenter) reparentTablets(
    ctx context.Context,
    ev *events.Reparent,
    reparentJournalPosition string,
    tabletMap map[string]*topo.TabletInfo,
    opts PlannedReparentOptions,
) error {
    // Create a cancellable context for the entire set of reparent operations.
    // If any error conditions happen, we can cancel all outgoing RPCs.
    replCtx, replCancel := context.WithTimeout(ctx, opts.WaitReplicasTimeout)
    defer replCancel()

    // Go through all the tablets:
    // - New primary: populate the reparent journal.
    // - Everybody else: reparent to the new primary; wait for the reparent
    //   journal row.
    event.DispatchUpdate(ev, "reparenting all tablets")

    // We add a (hopefully) unique record to the reparent journal table on the
    // new primary, so we can check if replicas got it through replication.
    reparentJournalTimestamp := time.Now().UnixNano()
    primaryElectAliasStr := topoproto.TabletAliasString(ev.NewPrimary.Alias)
    replicasWg := sync.WaitGroup{}
    rec := concurrency.AllErrorRecorder{}

    // Point all replicas at the new primary and check that they receive the
    // reparent journal entry, proving that they are replicating from the new
    // primary. We do this concurrently with adding the journal entry (after
    // this loop), because if semi-sync is enabled, the update to the journal
    // table will block until at least one replica is successfully attached to
    // the new primary.
    for alias, tabletInfo := range tabletMap {
        if alias == primaryElectAliasStr {
            continue
        }

        replicasWg.Add(1)

        go func(alias string, tablet *topodatapb.Tablet) {
            defer replicasWg.Done()
            pr.logger.Infof("setting new primary on replica %v", alias)

            // Note: we used to force replication to start on the old primary,
            // but now that we support "resuming" a previously-failed PRS
            // attempt, we can no longer assume that we know who the former
            // primary was. Instead, we rely on the former primary to remember
            // that it needs to start replication after transitioning from
            // PRIMARY => REPLICA.
            forceStartReplication := false
            if err := pr.tmc.SetReplicationSource(replCtx, tablet, ev.NewPrimary.Alias, reparentJournalTimestamp, "", forceStartReplication, IsReplicaSemiSync(opts.durability, ev.NewPrimary, tablet)); err != nil {
                rec.RecordError(vterrors.Wrapf(err, "tablet %v failed to SetReplicationSource(%v): %v", alias, primaryElectAliasStr, err))
            }
        }(alias, tabletInfo.Tablet)
    }

    // Add a reparent journal entry on the new primary. If semi-sync is enabled,
    // this blocks until at least one replica is reparented (above) and
    // successfully replicating from the new primary.
    //
    // If we fail to populate the reparent journal, there's no way the replicas
    // will work, so we cancel the ongoing reparent RPCs and bail out.
    pr.logger.Infof("populating reparent journal on new primary %v", primaryElectAliasStr)
    if err := pr.tmc.PopulateReparentJournal(replCtx, ev.NewPrimary, reparentJournalTimestamp, "PlannedReparentShard", ev.NewPrimary.Alias, reparentJournalPosition); err != nil {
        pr.logger.Warningf("primary failed to PopulateReparentJournal (position: %v); cancelling replica reparent attempts", reparentJournalPosition)
        replCancel()
        replicasWg.Wait()

        return vterrors.Wrapf(err, "failed PopulateReparentJournal(primary=%v, ts=%v, pos=%v): %v", primaryElectAliasStr, reparentJournalTimestamp, reparentJournalPosition, err)
    }

    // Reparent journal has been populated on the new primary. We just need to
    // wait for all the replicas to receive it.
    replicasWg.Wait()

    if err := rec.Error(); err != nil {
        msg := "some replicas failed to reparent; retry PlannedReparentShard with the same new primary alias (%v) to retry failed replicas"
        pr.logger.Errorf2(err, msg, primaryElectAliasStr)
        return vterrors.Wrapf(err, msg, primaryElectAliasStr)
    }

    return nil
}
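// Usage sketch: a minimal, illustrative example of how an external caller might
// drive ReparentShard. The identifiers ts (*topo.Server), tmc
// (tmclient.TabletManagerClient), ctx, and the keyspace, shard, cell, and UID
// values below are hypothetical placeholders, not values taken from this file.
//
//    pr := reparentutil.NewPlannedReparenter(ts, tmc, nil) // a nil logger is allowed; a no-op logger is substituted
//    ev, err := pr.ReparentShard(ctx, "commerce", "-80", reparentutil.PlannedReparentOptions{
//        NewPrimaryAlias:     &topodatapb.TabletAlias{Cell: "zone1", Uid: 101},
//        WaitReplicasTimeout: 30 * time.Second,
//    })
//    if err != nil {
//        // Handle the error; ev still carries the shard and primary info
//        // gathered before the failure.
//    }
//    _ = ev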