vitess.io/vitess@v0.16.2/go/vt/wrangler/traffic_switcher.go (about) 1 /* 2 Copyright 2019 The Vitess Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package wrangler 18 19 import ( 20 "context" 21 "errors" 22 "fmt" 23 "reflect" 24 "sort" 25 "strings" 26 "sync" 27 "time" 28 29 "vitess.io/vitess/go/sqlescape" 30 "vitess.io/vitess/go/vt/discovery" 31 32 "vitess.io/vitess/go/json2" 33 "vitess.io/vitess/go/vt/binlog/binlogplayer" 34 "vitess.io/vitess/go/vt/concurrency" 35 "vitess.io/vitess/go/vt/key" 36 "vitess.io/vitess/go/vt/log" 37 "vitess.io/vitess/go/vt/logutil" 38 "vitess.io/vitess/go/vt/sqlparser" 39 "vitess.io/vitess/go/vt/topo" 40 "vitess.io/vitess/go/vt/topotools" 41 "vitess.io/vitess/go/vt/vtctl/workflow" 42 "vitess.io/vitess/go/vt/vterrors" 43 "vitess.io/vitess/go/vt/vtgate/vindexes" 44 "vitess.io/vitess/go/vt/vttablet/tabletmanager/vreplication" 45 "vitess.io/vitess/go/vt/vttablet/tmclient" 46 47 binlogdatapb "vitess.io/vitess/go/vt/proto/binlogdata" 48 querypb "vitess.io/vitess/go/vt/proto/query" 49 topodatapb "vitess.io/vitess/go/vt/proto/topodata" 50 vschemapb "vitess.io/vitess/go/vt/proto/vschema" 51 vtrpcpb "vitess.io/vitess/go/vt/proto/vtrpc" 52 ) 53 54 const ( 55 errorNoStreams = "no streams found in keyspace %s for: %s" 56 // use pt-osc's naming convention, this format also ensures vstreamer ignores such tables 57 renameTableTemplate = "_%.59s_old" // limit table name to 64 characters 58 59 sqlDeleteWorkflow = "delete from _vt.vreplication 
where db_name = %s and workflow = %s" 60 ) 61 62 // accessType specifies the type of access for a shard (allow/disallow writes). 63 type accessType int 64 65 const ( 66 allowWrites = accessType(iota) 67 disallowWrites 68 69 // number of LOCK TABLES cycles to perform on the sources during SwitchWrites 70 lockTablesCycles = 2 71 // time to wait between LOCK TABLES cycles on the sources during SwitchWrites 72 lockTablesCycleDelay = time.Duration(100 * time.Millisecond) 73 74 // How long to wait when refreshing the state of each tablet in a shard. Note that these 75 // are refreshed in parallel, non-topo errors are ignored (in the error handling) and we 76 // may only do a partial refresh. Because in some cases it's unsafe to switch the traffic 77 // if some tablets do not refresh, we may need to look for partial results and produce 78 // an error (with the provided details of WHY) if we see them. 79 // Side note: the default lock/lease TTL in etcd is 60s so the default tablet refresh 80 // timeout of 60s can cause us to lose our keyspace lock before completing the 81 // operation too. 82 shardTabletRefreshTimeout = time.Duration(30 * time.Second) 83 ) 84 85 // trafficSwitcher contains the metadata for switching read and write traffic 86 // for vreplication streams. 87 type trafficSwitcher struct { 88 migrationType binlogdatapb.MigrationType 89 isPartialMigration bool 90 wr *Wrangler 91 workflow string 92 93 // if frozen is true, the rest of the fields are not set. 
94 frozen bool 95 reverseWorkflow string 96 id int64 97 sources map[string]*workflow.MigrationSource 98 targets map[string]*workflow.MigrationTarget 99 sourceKeyspace string 100 targetKeyspace string 101 tables []string 102 keepRoutingRules bool 103 sourceKSSchema *vindexes.KeyspaceSchema 104 optCells string //cells option passed to MoveTables/Reshard 105 optTabletTypes string //tabletTypes option passed to MoveTables/Reshard 106 externalCluster string 107 externalTopo *topo.Server 108 sourceTimeZone string 109 targetTimeZone string 110 workflowType binlogdatapb.VReplicationWorkflowType 111 workflowSubType binlogdatapb.VReplicationWorkflowSubType 112 } 113 114 /* 115 begin: implementation of workflow.ITrafficSwitcher 116 117 (NOTE:@ajm188) Please see comments on that interface type for why this exists. 118 This is temporary to allow workflow.StreamMigrator to use this trafficSwitcher 119 code and should be removed in the very near-term when we move trafficSwitcher to 120 package workflow as well. 
121 */ 122 123 var _ workflow.ITrafficSwitcher = (*trafficSwitcher)(nil) 124 125 func (ts *trafficSwitcher) TopoServer() *topo.Server { return ts.wr.ts } 126 func (ts *trafficSwitcher) TabletManagerClient() tmclient.TabletManagerClient { return ts.wr.tmc } 127 func (ts *trafficSwitcher) Logger() logutil.Logger { return ts.wr.logger } 128 func (ts *trafficSwitcher) VReplicationExec(ctx context.Context, alias *topodatapb.TabletAlias, query string) (*querypb.QueryResult, error) { 129 return ts.wr.VReplicationExec(ctx, alias, query) 130 } 131 132 func (ts *trafficSwitcher) ExternalTopo() *topo.Server { return ts.externalTopo } 133 func (ts *trafficSwitcher) MigrationType() binlogdatapb.MigrationType { return ts.migrationType } 134 func (ts *trafficSwitcher) IsPartialMigration() bool { return ts.isPartialMigration } 135 func (ts *trafficSwitcher) ReverseWorkflowName() string { return ts.reverseWorkflow } 136 func (ts *trafficSwitcher) SourceKeyspaceName() string { return ts.sourceKSSchema.Keyspace.Name } 137 func (ts *trafficSwitcher) SourceKeyspaceSchema() *vindexes.KeyspaceSchema { return ts.sourceKSSchema } 138 func (ts *trafficSwitcher) Sources() map[string]*workflow.MigrationSource { return ts.sources } 139 func (ts *trafficSwitcher) Tables() []string { return ts.tables } 140 func (ts *trafficSwitcher) TargetKeyspaceName() string { return ts.targetKeyspace } 141 func (ts *trafficSwitcher) Targets() map[string]*workflow.MigrationTarget { return ts.targets } 142 func (ts *trafficSwitcher) WorkflowName() string { return ts.workflow } 143 func (ts *trafficSwitcher) SourceTimeZone() string { return ts.sourceTimeZone } 144 func (ts *trafficSwitcher) TargetTimeZone() string { return ts.targetTimeZone } 145 146 func (ts *trafficSwitcher) ForAllSources(f func(source *workflow.MigrationSource) error) error { 147 var wg sync.WaitGroup 148 allErrors := &concurrency.AllErrorRecorder{} 149 for _, source := range ts.sources { 150 wg.Add(1) 151 go func(source 
*workflow.MigrationSource) { 152 defer wg.Done() 153 154 if err := f(source); err != nil { 155 allErrors.RecordError(err) 156 } 157 }(source) 158 } 159 wg.Wait() 160 return allErrors.AggrError(vterrors.Aggregate) 161 } 162 163 func (ts *trafficSwitcher) ForAllTargets(f func(source *workflow.MigrationTarget) error) error { 164 var wg sync.WaitGroup 165 allErrors := &concurrency.AllErrorRecorder{} 166 for _, target := range ts.targets { 167 wg.Add(1) 168 go func(target *workflow.MigrationTarget) { 169 defer wg.Done() 170 171 if err := f(target); err != nil { 172 allErrors.RecordError(err) 173 } 174 }(target) 175 } 176 wg.Wait() 177 return allErrors.AggrError(vterrors.Aggregate) 178 } 179 180 func (ts *trafficSwitcher) ForAllUIDs(f func(target *workflow.MigrationTarget, uid uint32) error) error { 181 var wg sync.WaitGroup 182 allErrors := &concurrency.AllErrorRecorder{} 183 for _, target := range ts.Targets() { 184 for uid := range target.Sources { 185 wg.Add(1) 186 go func(target *workflow.MigrationTarget, uid uint32) { 187 defer wg.Done() 188 189 if err := f(target, uid); err != nil { 190 allErrors.RecordError(err) 191 } 192 }(target, uid) 193 } 194 } 195 wg.Wait() 196 return allErrors.AggrError(vterrors.Aggregate) 197 } 198 199 /* end: implementation of workflow.ITrafficSwitcher */ 200 201 func (wr *Wrangler) getWorkflowState(ctx context.Context, targetKeyspace, workflowName string) (*trafficSwitcher, *workflow.State, error) { 202 ts, err := wr.buildTrafficSwitcher(ctx, targetKeyspace, workflowName) 203 204 if ts == nil || err != nil { 205 if errors.Is(err, workflow.ErrNoStreams) || err.Error() == fmt.Sprintf(errorNoStreams, targetKeyspace, workflowName) { 206 return nil, nil, nil 207 } 208 wr.Logger().Errorf("buildTrafficSwitcher failed: %v", err) 209 return nil, nil, err 210 } 211 212 ws := workflow.NewServer(wr.ts, wr.tmc) 213 state := &workflow.State{ 214 Workflow: workflowName, 215 SourceKeyspace: ts.SourceKeyspaceName(), 216 TargetKeyspace: targetKeyspace, 
217 IsPartialMigration: ts.isPartialMigration, 218 } 219 220 var ( 221 reverse bool 222 keyspace string 223 ) 224 225 // We reverse writes by using the source_keyspace.workflowname_reverse workflow 226 // spec, so we need to use the source of the reverse workflow, which is the 227 // target of the workflow initiated by the user for checking routing rules. 228 // Similarly we use a target shard of the reverse workflow as the original 229 // source to check if writes have been switched. 230 if strings.HasSuffix(workflowName, "_reverse") { 231 reverse = true 232 keyspace = state.SourceKeyspace 233 workflowName = workflow.ReverseWorkflowName(workflowName) 234 } else { 235 keyspace = targetKeyspace 236 } 237 if ts.MigrationType() == binlogdatapb.MigrationType_TABLES { 238 state.WorkflowType = workflow.TypeMoveTables 239 240 // We assume a consistent state, so only choose routing rule for one table. 241 if len(ts.Tables()) == 0 { 242 return nil, nil, fmt.Errorf("no tables in workflow %s.%s", keyspace, workflowName) 243 244 } 245 table := ts.Tables()[0] 246 247 if ts.isPartialMigration { // shard level traffic switching is all or nothing 248 shardRoutingRules, err := wr.ts.GetShardRoutingRules(ctx) 249 if err != nil { 250 return nil, nil, err 251 } 252 253 rules := shardRoutingRules.Rules 254 for _, rule := range rules { 255 if rule.ToKeyspace == ts.SourceKeyspaceName() { 256 state.ShardsNotYetSwitched = append(state.ShardsNotYetSwitched, rule.Shard) 257 } else { 258 state.ShardsAlreadySwitched = append(state.ShardsAlreadySwitched, rule.Shard) 259 } 260 } 261 } else { 262 state.RdonlyCellsSwitched, state.RdonlyCellsNotSwitched, err = ws.GetCellsWithTableReadsSwitched(ctx, keyspace, table, topodatapb.TabletType_RDONLY) 263 if err != nil { 264 return nil, nil, err 265 } 266 267 state.ReplicaCellsSwitched, state.ReplicaCellsNotSwitched, err = ws.GetCellsWithTableReadsSwitched(ctx, keyspace, table, topodatapb.TabletType_REPLICA) 268 if err != nil { 269 return nil, nil, err 
270 } 271 globalRules, err := topotools.GetRoutingRules(ctx, ts.TopoServer()) 272 if err != nil { 273 return nil, nil, err 274 } 275 for _, table := range ts.Tables() { 276 rr := globalRules[table] 277 // if a rule exists for the table and points to the target keyspace, writes have been switched 278 if len(rr) > 0 && rr[0] == fmt.Sprintf("%s.%s", keyspace, table) { 279 state.WritesSwitched = true 280 break 281 } 282 } 283 } 284 } else { 285 state.WorkflowType = workflow.TypeReshard 286 287 // we assume a consistent state, so only choose one shard 288 var shard *topo.ShardInfo 289 if reverse { 290 shard = ts.TargetShards()[0] 291 } else { 292 shard = ts.SourceShards()[0] 293 } 294 295 state.RdonlyCellsSwitched, state.RdonlyCellsNotSwitched, err = ws.GetCellsWithShardReadsSwitched(ctx, keyspace, shard, topodatapb.TabletType_RDONLY) 296 if err != nil { 297 return nil, nil, err 298 } 299 300 state.ReplicaCellsSwitched, state.ReplicaCellsNotSwitched, err = ws.GetCellsWithShardReadsSwitched(ctx, keyspace, shard, topodatapb.TabletType_REPLICA) 301 if err != nil { 302 return nil, nil, err 303 } 304 305 if !shard.IsPrimaryServing { 306 state.WritesSwitched = true 307 } 308 } 309 310 return ts, state, nil 311 } 312 313 // SwitchReads is a generic way of switching read traffic for a resharding workflow. 
314 func (wr *Wrangler) SwitchReads(ctx context.Context, targetKeyspace, workflowName string, servedTypes []topodatapb.TabletType, 315 cells []string, direction workflow.TrafficSwitchDirection, dryRun bool) (*[]string, error) { 316 317 ts, ws, err := wr.getWorkflowState(ctx, targetKeyspace, workflowName) 318 if err != nil { 319 wr.Logger().Errorf("getWorkflowState failed: %v", err) 320 return nil, err 321 } 322 if ts == nil { 323 errorMsg := fmt.Sprintf("workflow %s not found in keyspace %s", workflowName, targetKeyspace) 324 wr.Logger().Errorf(errorMsg) 325 return nil, fmt.Errorf(errorMsg) 326 } 327 log.Infof("Switching reads: %s.%s tt %+v, cells %+v, workflow state: %+v", targetKeyspace, workflowName, servedTypes, cells, ws) 328 var switchReplicas, switchRdonly bool 329 for _, servedType := range servedTypes { 330 if servedType != topodatapb.TabletType_REPLICA && servedType != topodatapb.TabletType_RDONLY { 331 return nil, fmt.Errorf("tablet type must be REPLICA or RDONLY: %v", servedType) 332 } 333 if direction == workflow.DirectionBackward && servedType == topodatapb.TabletType_REPLICA && len(ws.ReplicaCellsSwitched) == 0 { 334 return nil, fmt.Errorf("requesting reversal of read traffic for REPLICAs but REPLICA reads have not been switched") 335 } 336 if direction == workflow.DirectionBackward && servedType == topodatapb.TabletType_RDONLY && len(ws.RdonlyCellsSwitched) == 0 { 337 return nil, fmt.Errorf("requesting reversal of SwitchReads for RDONLYs but RDONLY reads have not been switched") 338 } 339 switch servedType { 340 case topodatapb.TabletType_REPLICA: 341 switchReplicas = true 342 case topodatapb.TabletType_RDONLY: 343 switchRdonly = true 344 } 345 } 346 347 // if there are no rdonly tablets in the cells ask to switch rdonly tablets as well so that routing rules 348 // are updated for rdonly as well. Otherwise vitess will not know that the workflow has completed and will 349 // incorrectly report that not all reads have been switched. 
User currently is forced to switch non-existent rdonly tablets 350 if switchReplicas && !switchRdonly { 351 var err error 352 rdonlyTabletsExist, err := topotools.DoCellsHaveRdonlyTablets(ctx, wr.ts, cells) 353 if err != nil { 354 return nil, err 355 } 356 if !rdonlyTabletsExist { 357 servedTypes = append(servedTypes, topodatapb.TabletType_RDONLY) 358 } 359 } 360 361 // If journals exist notify user and fail 362 journalsExist, _, err := ts.checkJournals(ctx) 363 if err != nil { 364 wr.Logger().Errorf("checkJournals failed: %v", err) 365 return nil, err 366 } 367 if journalsExist { 368 log.Infof("Found a previous journal entry for %d", ts.id) 369 } 370 var sw iswitcher 371 if dryRun { 372 sw = &switcherDryRun{ts: ts, drLog: NewLogRecorder()} 373 } else { 374 sw = &switcher{ts: ts, wr: wr} 375 } 376 377 if err := ts.validate(ctx); err != nil { 378 ts.Logger().Errorf("validate failed: %v", err) 379 return nil, err 380 } 381 382 // For reads, locking the source keyspace is sufficient. 383 ctx, unlock, lockErr := sw.lockKeyspace(ctx, ts.SourceKeyspaceName(), "SwitchReads") 384 if lockErr != nil { 385 ts.Logger().Errorf("LockKeyspace failed: %v", lockErr) 386 return nil, lockErr 387 } 388 defer unlock(&err) 389 390 if ts.MigrationType() == binlogdatapb.MigrationType_TABLES { 391 if ts.isPartialMigration { 392 ts.Logger().Infof("Partial migration, skipping switchTableReads as traffic is all or nothing per shard and overridden for reads AND writes in the ShardRoutingRule created when switching writes.") 393 } else if err := sw.switchTableReads(ctx, cells, servedTypes, direction); err != nil { 394 ts.Logger().Errorf("switchTableReads failed: %v", err) 395 return nil, err 396 } 397 return sw.logs(), nil 398 } 399 wr.Logger().Infof("About to switchShardReads: %+v, %+v, %+v", cells, servedTypes, direction) 400 if err := ts.switchShardReads(ctx, cells, servedTypes, direction); err != nil { 401 ts.Logger().Errorf("switchShardReads failed: %v", err) 402 return nil, err 403 } 404 
405 wr.Logger().Infof("switchShardReads Completed: %+v, %+v, %+v", cells, servedTypes, direction) 406 if err := wr.ts.ValidateSrvKeyspace(ctx, targetKeyspace, strings.Join(cells, ",")); err != nil { 407 err2 := vterrors.Wrapf(err, "After switching shard reads, found SrvKeyspace for %s is corrupt in cell %s", 408 targetKeyspace, strings.Join(cells, ",")) 409 log.Errorf("%w", err2) 410 return nil, err2 411 } 412 return sw.logs(), nil 413 } 414 415 func (wr *Wrangler) areTabletsAvailableToStreamFrom(ctx context.Context, ts *trafficSwitcher, keyspace string, shards []*topo.ShardInfo) error { 416 var cells []string 417 tabletTypes := ts.optTabletTypes 418 if ts.optCells != "" { 419 cells = strings.Split(ts.optCells, ",") 420 } 421 // FIXME: currently there is a default setting in the tablet that is used if user does not specify a tablet type, 422 // we use the value specified in the tablet flag `-vreplication_tablet_type` 423 // but ideally we should populate the vreplication table with a default value when we setup the workflow 424 if tabletTypes == "" { 425 tabletTypes = "PRIMARY,REPLICA" 426 } 427 428 var wg sync.WaitGroup 429 allErrors := &concurrency.AllErrorRecorder{} 430 for _, shard := range shards { 431 wg.Add(1) 432 go func(cells []string, keyspace string, shard *topo.ShardInfo) { 433 defer wg.Done() 434 if cells == nil { 435 cells = append(cells, shard.PrimaryAlias.Cell) 436 } 437 tp, err := discovery.NewTabletPicker(wr.ts, cells, keyspace, shard.ShardName(), tabletTypes) 438 if err != nil { 439 allErrors.RecordError(err) 440 return 441 } 442 tablets := tp.GetMatchingTablets(ctx) 443 if len(tablets) == 0 { 444 allErrors.RecordError(fmt.Errorf("no tablet found to source data in keyspace %s, shard %s", keyspace, shard.ShardName())) 445 return 446 } 447 }(cells, keyspace, shard) 448 } 449 450 wg.Wait() 451 if allErrors.HasErrors() { 452 log.Errorf("%s", allErrors.Error()) 453 return allErrors.Error() 454 } 455 return nil 456 } 457 458 // SwitchWrites is a 
generic way of migrating write traffic for a resharding workflow. 459 func (wr *Wrangler) SwitchWrites(ctx context.Context, targetKeyspace, workflowName string, timeout time.Duration, 460 cancel, reverse, reverseReplication bool, dryRun bool) (journalID int64, dryRunResults *[]string, err error) { 461 ts, ws, err := wr.getWorkflowState(ctx, targetKeyspace, workflowName) 462 _ = ws 463 if err != nil { 464 wr.Logger().Errorf("getWorkflowState failed: %v", err) 465 return 0, nil, err 466 } 467 if ts == nil { 468 errorMsg := fmt.Sprintf("workflow %s not found in keyspace %s", workflowName, targetKeyspace) 469 wr.Logger().Errorf(errorMsg) 470 return 0, nil, fmt.Errorf(errorMsg) 471 } 472 473 var sw iswitcher 474 if dryRun { 475 sw = &switcherDryRun{ts: ts, drLog: NewLogRecorder()} 476 } else { 477 sw = &switcher{ts: ts, wr: wr} 478 } 479 480 if ts.frozen { 481 ts.Logger().Warningf("Writes have already been switched for workflow %s, nothing to do here", ts.WorkflowName()) 482 return 0, sw.logs(), nil 483 } 484 485 ts.Logger().Infof("Built switching metadata: %+v", ts) 486 if err := ts.validate(ctx); err != nil { 487 ts.Logger().Errorf("validate failed: %v", err) 488 return 0, nil, err 489 } 490 491 if reverseReplication { 492 err := wr.areTabletsAvailableToStreamFrom(ctx, ts, ts.TargetKeyspaceName(), ts.TargetShards()) 493 if err != nil { 494 return 0, nil, err 495 } 496 } 497 498 // Need to lock both source and target keyspaces. 
499 tctx, sourceUnlock, lockErr := sw.lockKeyspace(ctx, ts.SourceKeyspaceName(), "SwitchWrites") 500 if lockErr != nil { 501 ts.Logger().Errorf("LockKeyspace failed: %v", lockErr) 502 return 0, nil, lockErr 503 } 504 ctx = tctx 505 defer sourceUnlock(&err) 506 if ts.TargetKeyspaceName() != ts.SourceKeyspaceName() { 507 tctx, targetUnlock, lockErr := sw.lockKeyspace(ctx, ts.TargetKeyspaceName(), "SwitchWrites") 508 if lockErr != nil { 509 ts.Logger().Errorf("LockKeyspace failed: %v", lockErr) 510 return 0, nil, lockErr 511 } 512 ctx = tctx 513 defer targetUnlock(&err) 514 } 515 516 // If no journals exist, sourceWorkflows will be initialized by sm.MigrateStreams. 517 journalsExist, sourceWorkflows, err := ts.checkJournals(ctx) 518 if err != nil { 519 ts.Logger().Errorf("checkJournals failed: %v", err) 520 return 0, nil, err 521 } 522 if !journalsExist { 523 ts.Logger().Infof("No previous journals were found. Proceeding normally.") 524 sm, err := workflow.BuildStreamMigrator(ctx, ts, cancel) 525 if err != nil { 526 ts.Logger().Errorf("buildStreamMigrater failed: %v", err) 527 return 0, nil, err 528 } 529 if cancel { 530 sw.cancelMigration(ctx, sm) 531 return 0, sw.logs(), nil 532 } 533 534 ts.Logger().Infof("Stopping streams") 535 sourceWorkflows, err = sw.stopStreams(ctx, sm) 536 if err != nil { 537 ts.Logger().Errorf("stopStreams failed: %v", err) 538 for key, streams := range sm.Streams() { 539 for _, stream := range streams { 540 ts.Logger().Errorf("stream in stopStreams: key %s shard %s stream %+v", key, stream.BinlogSource.Shard, stream.BinlogSource) 541 } 542 } 543 sw.cancelMigration(ctx, sm) 544 return 0, nil, err 545 } 546 547 ts.Logger().Infof("Stopping source writes") 548 if err := sw.stopSourceWrites(ctx); err != nil { 549 ts.Logger().Errorf("stopSourceWrites failed: %v", err) 550 sw.cancelMigration(ctx, sm) 551 return 0, nil, err 552 } 553 554 if ts.MigrationType() == binlogdatapb.MigrationType_TABLES { 555 ts.Logger().Infof("Executing LOCK TABLES on 
source tables %d times", lockTablesCycles) 556 // Doing this twice with a pause in-between to catch any writes that may have raced in between 557 // the tablet's deny list check and the first mysqld side table lock. 558 for cnt := 1; cnt <= lockTablesCycles; cnt++ { 559 if err := ts.executeLockTablesOnSource(ctx); err != nil { 560 ts.Logger().Errorf("Failed to execute LOCK TABLES (attempt %d of %d) on sources: %v", cnt, lockTablesCycles, err) 561 sw.cancelMigration(ctx, sm) 562 return 0, nil, err 563 } 564 // No need to UNLOCK the tables as the connection was closed once the locks were acquired 565 // and thus the locks released. 566 time.Sleep(lockTablesCycleDelay) 567 } 568 } 569 570 ts.Logger().Infof("Waiting for streams to catchup") 571 if err := sw.waitForCatchup(ctx, timeout); err != nil { 572 ts.Logger().Errorf("waitForCatchup failed: %v", err) 573 sw.cancelMigration(ctx, sm) 574 return 0, nil, err 575 } 576 577 ts.Logger().Infof("Migrating streams") 578 if err := sw.migrateStreams(ctx, sm); err != nil { 579 ts.Logger().Errorf("migrateStreams failed: %v", err) 580 sw.cancelMigration(ctx, sm) 581 return 0, nil, err 582 } 583 584 ts.Logger().Infof("Creating reverse streams") 585 if err := sw.createReverseVReplication(ctx); err != nil { 586 ts.Logger().Errorf("createReverseVReplication failed: %v", err) 587 sw.cancelMigration(ctx, sm) 588 return 0, nil, err 589 } 590 } else { 591 if cancel { 592 err := fmt.Errorf("traffic switching has reached the point of no return, cannot cancel") 593 ts.Logger().Errorf("%v", err) 594 return 0, nil, err 595 } 596 ts.Logger().Infof("Journals were found. Completing the left over steps.") 597 // Need to gather positions in case all journals were not created. 598 if err := ts.gatherPositions(ctx); err != nil { 599 ts.Logger().Errorf("gatherPositions failed: %v", err) 600 return 0, nil, err 601 } 602 } 603 604 // This is the point of no return. Once a journal is created, 605 // traffic can be redirected to target shards. 
606 if err := sw.createJournals(ctx, sourceWorkflows); err != nil { 607 ts.Logger().Errorf("createJournals failed: %v", err) 608 return 0, nil, err 609 } 610 if err := sw.allowTargetWrites(ctx); err != nil { 611 ts.Logger().Errorf("allowTargetWrites failed: %v", err) 612 return 0, nil, err 613 } 614 if err := sw.changeRouting(ctx); err != nil { 615 ts.Logger().Errorf("changeRouting failed: %v", err) 616 return 0, nil, err 617 } 618 if err := sw.streamMigraterfinalize(ctx, ts, sourceWorkflows); err != nil { 619 ts.Logger().Errorf("finalize failed: %v", err) 620 return 0, nil, err 621 } 622 if reverseReplication { 623 if err := sw.startReverseVReplication(ctx); err != nil { 624 ts.Logger().Errorf("startReverseVReplication failed: %v", err) 625 return 0, nil, err 626 } 627 } 628 629 if err := sw.freezeTargetVReplication(ctx); err != nil { 630 ts.Logger().Errorf("deleteTargetVReplication failed: %v", err) 631 return 0, nil, err 632 } 633 634 return ts.id, sw.logs(), nil 635 } 636 637 // DropTargets cleans up target tables, shards and denied tables if a MoveTables/Reshard is cancelled 638 func (wr *Wrangler) DropTargets(ctx context.Context, targetKeyspace, workflow string, keepData, keepRoutingRules, dryRun bool) (*[]string, error) { 639 ts, err := wr.buildTrafficSwitcher(ctx, targetKeyspace, workflow) 640 if err != nil { 641 wr.Logger().Errorf("buildTrafficSwitcher failed: %v", err) 642 return nil, err 643 } 644 ts.keepRoutingRules = keepRoutingRules 645 var sw iswitcher 646 if dryRun { 647 sw = &switcherDryRun{ts: ts, drLog: NewLogRecorder()} 648 } else { 649 sw = &switcher{ts: ts, wr: wr} 650 } 651 var tctx context.Context 652 tctx, sourceUnlock, lockErr := sw.lockKeyspace(ctx, ts.SourceKeyspaceName(), "DropTargets") 653 if lockErr != nil { 654 ts.Logger().Errorf("Source LockKeyspace failed: %v", lockErr) 655 return nil, lockErr 656 } 657 defer sourceUnlock(&err) 658 ctx = tctx 659 if ts.TargetKeyspaceName() != ts.SourceKeyspaceName() { 660 tctx, targetUnlock, 
lockErr := sw.lockKeyspace(ctx, ts.TargetKeyspaceName(), "DropTargets") 661 if lockErr != nil { 662 ts.Logger().Errorf("Target LockKeyspace failed: %v", lockErr) 663 return nil, lockErr 664 } 665 defer targetUnlock(&err) 666 ctx = tctx 667 } 668 if !keepData { 669 switch ts.MigrationType() { 670 case binlogdatapb.MigrationType_TABLES: 671 log.Infof("Deleting target tables") 672 if err := sw.removeTargetTables(ctx); err != nil { 673 return nil, err 674 } 675 if err := sw.dropSourceDeniedTables(ctx); err != nil { 676 return nil, err 677 } 678 case binlogdatapb.MigrationType_SHARDS: 679 log.Infof("Removing target shards") 680 if err := sw.dropTargetShards(ctx); err != nil { 681 return nil, err 682 } 683 } 684 } 685 if err := wr.dropArtifacts(ctx, keepRoutingRules, sw); err != nil { 686 return nil, err 687 } 688 if err := ts.TopoServer().RebuildSrvVSchema(ctx, nil); err != nil { 689 return nil, err 690 } 691 return sw.logs(), nil 692 } 693 694 func (wr *Wrangler) dropArtifacts(ctx context.Context, keepRoutingRules bool, sw iswitcher) error { 695 if err := sw.dropSourceReverseVReplicationStreams(ctx); err != nil { 696 return err 697 } 698 if err := sw.dropTargetVReplicationStreams(ctx); err != nil { 699 return err 700 } 701 if !keepRoutingRules { 702 if err := sw.deleteRoutingRules(ctx); err != nil { 703 return err 704 } 705 if err := sw.deleteShardRoutingRules(ctx); err != nil { 706 return err 707 } 708 } 709 710 return nil 711 } 712 713 // finalizeMigrateWorkflow deletes the streams for the Migrate workflow. 
714 // We only cleanup the target for external sources 715 func (wr *Wrangler) finalizeMigrateWorkflow(ctx context.Context, targetKeyspace, workflow, tableSpecs string, 716 cancel, keepData, keepRoutingRules, dryRun bool) (*[]string, error) { 717 ts, err := wr.buildTrafficSwitcher(ctx, targetKeyspace, workflow) 718 if err != nil { 719 wr.Logger().Errorf("buildTrafficSwitcher failed: %v", err) 720 return nil, err 721 } 722 var sw iswitcher 723 if dryRun { 724 sw = &switcherDryRun{ts: ts, drLog: NewLogRecorder()} 725 } else { 726 sw = &switcher{ts: ts, wr: wr} 727 } 728 var tctx context.Context 729 tctx, targetUnlock, lockErr := sw.lockKeyspace(ctx, ts.TargetKeyspaceName(), "completeMigrateWorkflow") 730 if lockErr != nil { 731 ts.Logger().Errorf("Target LockKeyspace failed: %v", lockErr) 732 return nil, lockErr 733 } 734 defer targetUnlock(&err) 735 ctx = tctx 736 if err := sw.dropTargetVReplicationStreams(ctx); err != nil { 737 return nil, err 738 } 739 if !cancel { 740 sw.addParticipatingTablesToKeyspace(ctx, targetKeyspace, tableSpecs) 741 if err := ts.TopoServer().RebuildSrvVSchema(ctx, nil); err != nil { 742 return nil, err 743 } 744 } 745 log.Infof("cancel is %t, keepData %t", cancel, keepData) 746 if cancel && !keepData { 747 if err := sw.removeTargetTables(ctx); err != nil { 748 return nil, err 749 } 750 } 751 return sw.logs(), nil 752 } 753 754 // DropSources cleans up source tables, shards and denied tables after a MoveTables/Reshard is completed 755 func (wr *Wrangler) DropSources(ctx context.Context, targetKeyspace, workflowName string, removalType workflow.TableRemovalType, keepData, keepRoutingRules, force, dryRun bool) (*[]string, error) { 756 ts, err := wr.buildTrafficSwitcher(ctx, targetKeyspace, workflowName) 757 if err != nil { 758 wr.Logger().Errorf("buildTrafficSwitcher failed: %v", err) 759 return nil, err 760 } 761 var sw iswitcher 762 if dryRun { 763 sw = &switcherDryRun{ts: ts, drLog: NewLogRecorder()} 764 } else { 765 sw = &switcher{ts: ts, 
// buildTrafficSwitcher assembles the trafficSwitcher for workflowName in
// targetKeyspace.
//
// It loads the workflow's targets via workflow.BuildTargets, then walks every
// binlog source of every target to discover the source keyspace, the table
// list and the source shards (with their primary tablets). It returns an error
// if the streams disagree on the source keyspace or on the (sorted) table
// list, or if any topo lookup fails. Finally it derives the migration type,
// loads the source keyspace vschema and records whether the workflow is a
// partial (shard-by-shard) MoveTables.
func (wr *Wrangler) buildTrafficSwitcher(ctx context.Context, targetKeyspace, workflowName string) (*trafficSwitcher, error) {
	tgtInfo, err := workflow.BuildTargets(ctx, wr.ts, wr.tmc, targetKeyspace, workflowName)
	if err != nil {
		log.Infof("Error building targets: %s", err)
		return nil, err
	}
	targets, frozen, optCells, optTabletTypes := tgtInfo.Targets, tgtInfo.Frozen, tgtInfo.OptCells, tgtInfo.OptTabletTypes

	ts := &trafficSwitcher{
		wr:              wr,
		workflow:        workflowName,
		reverseWorkflow: workflow.ReverseWorkflowName(workflowName),
		id:              workflow.HashStreams(targetKeyspace, targets),
		targets:         targets,
		sources:         make(map[string]*workflow.MigrationSource),
		targetKeyspace:  targetKeyspace,
		frozen:          frozen,
		optCells:        optCells,
		optTabletTypes:  optTabletTypes,
		workflowType:    tgtInfo.WorkflowType,
		workflowSubType: tgtInfo.WorkflowSubType,
	}
	log.Infof("Migration ID for workflow %s: %d", workflowName, ts.id)
	// Source shards are looked up in the local topo unless the streams name an
	// external cluster, in which case sourceTopo is swapped below.
	sourceTopo := wr.ts

	// Build the sources
	for _, target := range targets {
		for _, bls := range target.Sources {
			if ts.sourceKeyspace == "" {
				// First stream seen: it defines the source keyspace, time zones
				// and (optional) external cluster for the whole workflow.
				ts.sourceKeyspace = bls.Keyspace
				ts.sourceTimeZone = bls.SourceTimeZone
				ts.targetTimeZone = bls.TargetTimeZone
				ts.externalCluster = bls.ExternalCluster
				if ts.externalCluster != "" {
					externalTopo, err := wr.ts.OpenExternalVitessClusterServer(ctx, ts.externalCluster)
					if err != nil {
						return nil, err
					}
					sourceTopo = externalTopo
					ts.externalTopo = externalTopo
				}
			} else if ts.sourceKeyspace != bls.Keyspace {
				return nil, fmt.Errorf("source keyspaces are mismatched across streams: %v vs %v", ts.sourceKeyspace, bls.Keyspace)
			}

			// The table list is taken from the first stream's filter rules; every
			// later stream must carry exactly the same sorted list.
			// NOTE(review): bls.Filter is dereferenced without a nil check —
			// presumably BuildTargets guarantees it is set; confirm.
			if ts.tables == nil {
				for _, rule := range bls.Filter.Rules {
					ts.tables = append(ts.tables, rule.Match)
				}
				sort.Strings(ts.tables)
			} else {
				var tables []string
				for _, rule := range bls.Filter.Rules {
					tables = append(tables, rule.Match)
				}
				sort.Strings(tables)
				if !reflect.DeepEqual(ts.tables, tables) {
					return nil, fmt.Errorf("table lists are mismatched across streams: %v vs %v", ts.tables, tables)
				}
			}

			// Each source shard is resolved only once, even if several streams
			// read from it.
			if _, ok := ts.sources[bls.Shard]; ok {
				continue
			}
			sourcesi, err := sourceTopo.GetShard(ctx, bls.Keyspace, bls.Shard)
			if err != nil {
				return nil, err
			}
			sourcePrimary, err := sourceTopo.GetTablet(ctx, sourcesi.PrimaryAlias)
			if err != nil {
				return nil, err
			}
			ts.sources[bls.Shard] = workflow.NewMigrationSource(sourcesi, sourcePrimary)
		}
	}
	// Different keyspaces (or an external source cluster) always mean a table
	// migration. Within a single keyspace it is a shard migration unless a
	// source shard name is also a target shard name.
	if ts.sourceKeyspace != ts.targetKeyspace || ts.externalCluster != "" {
		ts.migrationType = binlogdatapb.MigrationType_TABLES
	} else {
		// TODO(sougou): for shard migration, validate that source and target combined
		// keyranges match.
		ts.migrationType = binlogdatapb.MigrationType_SHARDS
		for sourceShard := range ts.sources {
			if _, ok := ts.targets[sourceShard]; ok {
				// If shards are overlapping, then this is a table migration.
				ts.migrationType = binlogdatapb.MigrationType_TABLES
				break
			}
		}
	}
	vs, err := sourceTopo.GetVSchema(ctx, ts.sourceKeyspace)
	if err != nil {
		return nil, err
	}
	ts.sourceKSSchema, err = vindexes.BuildKeyspaceSchema(vs, ts.sourceKeyspace)
	if err != nil {
		return nil, err
	}

	sourceShards, targetShards := ts.getSourceAndTargetShardsNames()

	ts.isPartialMigration, err = ts.isPartialMoveTables(sourceShards, targetShards)
	if err != nil {
		return nil, err
	}
	if ts.isPartialMigration {
		log.Infof("Migration is partial, for shards %+v", sourceShards)
	}
	return ts, nil
}
ts.MigrationType() != binlogdatapb.MigrationType_TABLES { 944 return false, nil 945 } 946 947 skr, tkr, err := getSourceAndTargetKeyRanges(sourceShards, targetShards) 948 if err != nil { 949 return false, err 950 } 951 952 if !key.KeyRangeIsPartial(skr) || !key.KeyRangeIsPartial(tkr) || // both cover full range 953 len(sourceShards) != len(targetShards) { 954 955 return false, nil 956 } 957 958 return key.KeyRangeEqual(skr, tkr), nil 959 } 960 961 func getSourceAndTargetKeyRanges(sourceShards, targetShards []string) (*topodatapb.KeyRange, *topodatapb.KeyRange, error) { 962 if len(sourceShards) == 0 || len(targetShards) == 0 { 963 return nil, nil, fmt.Errorf("either source or target shards are missing") 964 } 965 966 getKeyRange := func(shard string) (*topodatapb.KeyRange, error) { 967 krs, err := key.ParseShardingSpec(shard) 968 if err != nil { 969 return nil, err 970 } 971 return krs[0], nil 972 } 973 974 // happily string sorting of shards also sorts them in the ascending order of key ranges in vitess 975 sort.Strings(sourceShards) 976 sort.Strings(targetShards) 977 getFullKeyRange := func(shards []string) (*topodatapb.KeyRange, error) { 978 // expect sorted shards 979 kr1, err := getKeyRange(sourceShards[0]) 980 if err != nil { 981 return nil, err 982 } 983 kr2, err := getKeyRange(sourceShards[len(sourceShards)-1]) 984 if err != nil { 985 return nil, err 986 } 987 return &topodatapb.KeyRange{ 988 Start: kr1.Start, 989 End: kr2.End, 990 }, nil 991 } 992 993 skr, err := getFullKeyRange(sourceShards) 994 if err != nil { 995 return nil, nil, err 996 } 997 tkr, err := getFullKeyRange(targetShards) 998 if err != nil { 999 return nil, nil, err 1000 } 1001 1002 return skr, tkr, nil 1003 } 1004 1005 func (ts *trafficSwitcher) validate(ctx context.Context) error { 1006 if ts.MigrationType() == binlogdatapb.MigrationType_TABLES { 1007 if ts.isPartialMigration { 1008 return nil 1009 } 1010 sourceTopo := ts.wr.ts 1011 if ts.externalTopo != nil { 1012 sourceTopo = 
ts.externalTopo 1013 } 1014 1015 // All shards must be present. 1016 if err := workflow.CompareShards(ctx, ts.SourceKeyspaceName(), ts.SourceShards(), sourceTopo); err != nil { 1017 return err 1018 } 1019 if err := workflow.CompareShards(ctx, ts.TargetKeyspaceName(), ts.TargetShards(), ts.wr.ts); err != nil { 1020 return err 1021 } 1022 // Wildcard table names not allowed. 1023 for _, table := range ts.tables { 1024 if strings.HasPrefix(table, "/") { 1025 return fmt.Errorf("cannot migrate streams with wild card table names: %v", table) 1026 } 1027 } 1028 } 1029 return nil 1030 } 1031 1032 func (ts *trafficSwitcher) switchTableReads(ctx context.Context, cells []string, servedTypes []topodatapb.TabletType, direction workflow.TrafficSwitchDirection) error { 1033 log.Infof("switchTableReads: servedTypes: %+v, direction %t", servedTypes, direction) 1034 rules, err := topotools.GetRoutingRules(ctx, ts.TopoServer()) 1035 if err != nil { 1036 return err 1037 } 1038 // We assume that the following rules were setup when the targets were created: 1039 // table -> sourceKeyspace.table 1040 // targetKeyspace.table -> sourceKeyspace.table 1041 // For forward migration, we add tablet type specific rules to redirect traffic to the target. 1042 // For backward, we redirect to source 1043 for _, servedType := range servedTypes { 1044 tt := strings.ToLower(servedType.String()) 1045 for _, table := range ts.Tables() { 1046 if direction == workflow.DirectionForward { 1047 log.Infof("Route direction forward") 1048 toTarget := []string{ts.TargetKeyspaceName() + "." + table} 1049 rules[table+"@"+tt] = toTarget 1050 rules[ts.TargetKeyspaceName()+"."+table+"@"+tt] = toTarget 1051 rules[ts.SourceKeyspaceName()+"."+table+"@"+tt] = toTarget 1052 } else { 1053 log.Infof("Route direction backwards") 1054 toSource := []string{ts.SourceKeyspaceName() + "." 
+ table} 1055 rules[table+"@"+tt] = toSource 1056 rules[ts.TargetKeyspaceName()+"."+table+"@"+tt] = toSource 1057 rules[ts.SourceKeyspaceName()+"."+table+"@"+tt] = toSource 1058 } 1059 } 1060 } 1061 if err := topotools.SaveRoutingRules(ctx, ts.TopoServer(), rules); err != nil { 1062 return err 1063 } 1064 return ts.TopoServer().RebuildSrvVSchema(ctx, cells) 1065 } 1066 1067 func (ts *trafficSwitcher) switchShardReads(ctx context.Context, cells []string, servedTypes []topodatapb.TabletType, direction workflow.TrafficSwitchDirection) error { 1068 var fromShards, toShards []*topo.ShardInfo 1069 if direction == workflow.DirectionForward { 1070 fromShards, toShards = ts.SourceShards(), ts.TargetShards() 1071 } else { 1072 fromShards, toShards = ts.TargetShards(), ts.SourceShards() 1073 } 1074 if err := ts.TopoServer().ValidateSrvKeyspace(ctx, ts.TargetKeyspaceName(), strings.Join(cells, ",")); err != nil { 1075 err2 := vterrors.Wrapf(err, "Before switching shard reads, found SrvKeyspace for %s is corrupt in cell %s", 1076 ts.TargetKeyspaceName(), strings.Join(cells, ",")) 1077 log.Errorf("%w", err2) 1078 return err2 1079 } 1080 for _, servedType := range servedTypes { 1081 if err := ts.wr.updateShardRecords(ctx, ts.SourceKeyspaceName(), fromShards, cells, servedType, true /* isFrom */, false /* clearSourceShards */); err != nil { 1082 return err 1083 } 1084 if err := ts.wr.updateShardRecords(ctx, ts.SourceKeyspaceName(), toShards, cells, servedType, false, false); err != nil { 1085 return err 1086 } 1087 err := ts.TopoServer().MigrateServedType(ctx, ts.SourceKeyspaceName(), toShards, fromShards, servedType, cells) 1088 if err != nil { 1089 return err 1090 } 1091 } 1092 if err := ts.TopoServer().ValidateSrvKeyspace(ctx, ts.TargetKeyspaceName(), strings.Join(cells, ",")); err != nil { 1093 err2 := vterrors.Wrapf(err, "After switching shard reads, found SrvKeyspace for %s is corrupt in cell %s", 1094 ts.TargetKeyspaceName(), strings.Join(cells, ",")) 1095 log.Errorf("%w", 
err2) 1096 return err2 1097 } 1098 return nil 1099 } 1100 1101 // checkJournals returns true if at least one journal has been created. 1102 // If so, it also returns the list of sourceWorkflows that need to be switched. 1103 func (ts *trafficSwitcher) checkJournals(ctx context.Context) (journalsExist bool, sourceWorkflows []string, err error) { 1104 var ( 1105 ws = workflow.NewServer(ts.TopoServer(), ts.TabletManagerClient()) 1106 mu sync.Mutex 1107 ) 1108 1109 err = ts.ForAllSources(func(source *workflow.MigrationSource) error { 1110 mu.Lock() 1111 defer mu.Unlock() 1112 journal, exists, err := ws.CheckReshardingJournalExistsOnTablet(ctx, source.GetPrimary().Tablet, ts.id) 1113 if err != nil { 1114 return err 1115 } 1116 if exists { 1117 if journal.Id != 0 { 1118 sourceWorkflows = journal.SourceWorkflows 1119 } 1120 source.Journaled = true 1121 journalsExist = true 1122 } 1123 return nil 1124 }) 1125 return journalsExist, sourceWorkflows, err 1126 } 1127 1128 func (ts *trafficSwitcher) stopSourceWrites(ctx context.Context) error { 1129 var err error 1130 if ts.MigrationType() == binlogdatapb.MigrationType_TABLES { 1131 err = ts.changeTableSourceWrites(ctx, disallowWrites) 1132 } else { 1133 err = ts.changeShardsAccess(ctx, ts.SourceKeyspaceName(), ts.SourceShards(), disallowWrites) 1134 } 1135 if err != nil { 1136 log.Warningf("Error: %s", err) 1137 return err 1138 } 1139 return ts.ForAllSources(func(source *workflow.MigrationSource) error { 1140 var err error 1141 source.Position, err = ts.TabletManagerClient().PrimaryPosition(ctx, source.GetPrimary().Tablet) 1142 ts.wr.Logger().Infof("Stopped Source Writes. 
Position for source %v:%v: %v", 1143 ts.SourceKeyspaceName(), source.GetShard().ShardName(), source.Position) 1144 if err != nil { 1145 log.Warningf("Error: %s", err) 1146 } 1147 return err 1148 }) 1149 } 1150 1151 func (ts *trafficSwitcher) changeTableSourceWrites(ctx context.Context, access accessType) error { 1152 return ts.ForAllSources(func(source *workflow.MigrationSource) error { 1153 if _, err := ts.TopoServer().UpdateShardFields(ctx, ts.SourceKeyspaceName(), source.GetShard().ShardName(), func(si *topo.ShardInfo) error { 1154 return si.UpdateSourceDeniedTables(ctx, topodatapb.TabletType_PRIMARY, nil, access == allowWrites /* remove */, ts.Tables()) 1155 }); err != nil { 1156 return err 1157 } 1158 rtbsCtx, cancel := context.WithTimeout(ctx, shardTabletRefreshTimeout) 1159 defer cancel() 1160 isPartial, partialDetails, err := topotools.RefreshTabletsByShard(rtbsCtx, ts.TopoServer(), ts.TabletManagerClient(), source.GetShard(), nil, ts.Logger()) 1161 if isPartial { 1162 err = fmt.Errorf("failed to successfully refresh all tablets in the %s/%s source shard (%v):\n %v", 1163 source.GetShard().Keyspace(), source.GetShard().ShardName(), err, partialDetails) 1164 } 1165 return err 1166 }) 1167 } 1168 1169 // executeLockTablesOnSource executes a LOCK TABLES tb1 READ, tbl2 READ,... statement on each 1170 // source shard's primary tablet using a non-pooled connection as the DBA user. The connection 1171 // is closed when the LOCK TABLES statement returns, so we immediately release the LOCKs. 
1172 func (ts *trafficSwitcher) executeLockTablesOnSource(ctx context.Context) error { 1173 ts.Logger().Infof("Locking (and then immediately unlocking) the following tables on source keyspace %v: %v", ts.SourceKeyspaceName(), ts.Tables()) 1174 if len(ts.Tables()) == 0 { 1175 return vterrors.Errorf(vtrpcpb.Code_INTERNAL, "no tables found in the source keyspace %v associated with the %s workflow", ts.SourceKeyspaceName(), ts.WorkflowName()) 1176 } 1177 1178 sb := strings.Builder{} 1179 sb.WriteString("LOCK TABLES ") 1180 for _, tableName := range ts.Tables() { 1181 sb.WriteString(fmt.Sprintf("%s READ,", sqlescape.EscapeID(tableName))) 1182 } 1183 // trim extra trailing comma 1184 lockStmt := sb.String()[:sb.Len()-1] 1185 1186 return ts.ForAllSources(func(source *workflow.MigrationSource) error { 1187 primary := source.GetPrimary() 1188 if primary == nil { 1189 return vterrors.Errorf(vtrpcpb.Code_INTERNAL, "no primary found for source shard %s", source.GetShard()) 1190 } 1191 tablet := primary.Tablet 1192 _, err := ts.wr.ExecuteFetchAsDba(ctx, tablet.Alias, lockStmt, 1, false, true) 1193 if err != nil { 1194 ts.Logger().Errorf("Error executing %s on source tablet %v: %v", lockStmt, tablet, err) 1195 return err 1196 } 1197 return err 1198 }) 1199 } 1200 1201 func (ts *trafficSwitcher) waitForCatchup(ctx context.Context, filteredReplicationWaitTime time.Duration) error { 1202 ctx, cancel := context.WithTimeout(ctx, filteredReplicationWaitTime) 1203 defer cancel() 1204 // source writes have been stopped, wait for all streams on targets to catch up 1205 if err := ts.ForAllUIDs(func(target *workflow.MigrationTarget, uid uint32) error { 1206 ts.Logger().Infof("Before Catchup: uid: %d, target primary %s, target position %s, shard %s", uid, 1207 target.GetPrimary().AliasString(), target.Position, target.GetShard().String()) 1208 bls := target.Sources[uid] 1209 source := ts.Sources()[bls.Shard] 1210 ts.Logger().Infof("Before Catchup: waiting for keyspace:shard: %v:%v to reach 
source position %v, uid %d", 1211 ts.TargetKeyspaceName(), target.GetShard().ShardName(), source.Position, uid) 1212 if err := ts.TabletManagerClient().VReplicationWaitForPos(ctx, target.GetPrimary().Tablet, int(uid), source.Position); err != nil { 1213 return err 1214 } 1215 log.Infof("After catchup: target keyspace:shard: %v:%v, source position %v, uid %d", 1216 ts.TargetKeyspaceName(), target.GetShard().ShardName(), source.Position, uid) 1217 ts.Logger().Infof("After catchup: position for keyspace:shard: %v:%v reached, uid %d", 1218 ts.TargetKeyspaceName(), target.GetShard().ShardName(), uid) 1219 if _, err := ts.TabletManagerClient().VReplicationExec(ctx, target.GetPrimary().Tablet, binlogplayer.StopVReplication(uid, "stopped for cutover")); err != nil { 1220 log.Infof("error marking stopped for cutover on %s, uid %d", target.GetPrimary().AliasString(), uid) 1221 return err 1222 } 1223 return nil 1224 }); err != nil { 1225 return err 1226 } 1227 // all targets have caught up, record their positions for setting up reverse workflows 1228 return ts.ForAllTargets(func(target *workflow.MigrationTarget) error { 1229 var err error 1230 target.Position, err = ts.TabletManagerClient().PrimaryPosition(ctx, target.GetPrimary().Tablet) 1231 ts.Logger().Infof("After catchup, position for target primary %s, %v", target.GetPrimary().AliasString(), target.Position) 1232 return err 1233 }) 1234 } 1235 1236 func (ts *trafficSwitcher) cancelMigration(ctx context.Context, sm *workflow.StreamMigrator) { 1237 var err error 1238 if ts.MigrationType() == binlogdatapb.MigrationType_TABLES { 1239 err = ts.changeTableSourceWrites(ctx, allowWrites) 1240 } else { 1241 err = ts.changeShardsAccess(ctx, ts.SourceKeyspaceName(), ts.SourceShards(), allowWrites) 1242 } 1243 if err != nil { 1244 ts.Logger().Errorf("Cancel migration failed:", err) 1245 } 1246 1247 sm.CancelMigration(ctx) 1248 1249 err = ts.ForAllTargets(func(target *workflow.MigrationTarget) error { 1250 query := 
fmt.Sprintf("update _vt.vreplication set state='Running', message='' where db_name=%s and workflow=%s", encodeString(target.GetPrimary().DbName()), encodeString(ts.WorkflowName())) 1251 _, err := ts.TabletManagerClient().VReplicationExec(ctx, target.GetPrimary().Tablet, query) 1252 return err 1253 }) 1254 if err != nil { 1255 ts.Logger().Errorf("Cancel migration failed: could not restart vreplication: %v", err) 1256 } 1257 1258 err = ts.deleteReverseVReplication(ctx) 1259 if err != nil { 1260 ts.Logger().Errorf("Cancel migration failed: could not delete revers vreplication entries: %v", err) 1261 } 1262 } 1263 1264 func (ts *trafficSwitcher) gatherPositions(ctx context.Context) error { 1265 err := ts.ForAllSources(func(source *workflow.MigrationSource) error { 1266 var err error 1267 source.Position, err = ts.TabletManagerClient().PrimaryPosition(ctx, source.GetPrimary().Tablet) 1268 ts.Logger().Infof("Position for source %v:%v: %v", ts.SourceKeyspaceName(), source.GetShard().ShardName(), source.Position) 1269 return err 1270 }) 1271 if err != nil { 1272 return err 1273 } 1274 return ts.ForAllTargets(func(target *workflow.MigrationTarget) error { 1275 var err error 1276 target.Position, err = ts.TabletManagerClient().PrimaryPosition(ctx, target.GetPrimary().Tablet) 1277 ts.Logger().Infof("Position for target %v:%v: %v", ts.TargetKeyspaceName(), target.GetShard().ShardName(), target.Position) 1278 return err 1279 }) 1280 } 1281 1282 func (ts *trafficSwitcher) createReverseVReplication(ctx context.Context) error { 1283 if err := ts.deleteReverseVReplication(ctx); err != nil { 1284 return err 1285 } 1286 err := ts.ForAllUIDs(func(target *workflow.MigrationTarget, uid uint32) error { 1287 bls := target.Sources[uid] 1288 source := ts.Sources()[bls.Shard] 1289 reverseBls := &binlogdatapb.BinlogSource{ 1290 Keyspace: ts.TargetKeyspaceName(), 1291 Shard: target.GetShard().ShardName(), 1292 TabletType: bls.TabletType, 1293 Filter: &binlogdatapb.Filter{}, 1294 OnDdl: 
bls.OnDdl, 1295 SourceTimeZone: bls.TargetTimeZone, 1296 TargetTimeZone: bls.SourceTimeZone, 1297 } 1298 1299 for _, rule := range bls.Filter.Rules { 1300 if rule.Filter == "exclude" { 1301 reverseBls.Filter.Rules = append(reverseBls.Filter.Rules, rule) 1302 continue 1303 } 1304 var filter string 1305 if strings.HasPrefix(rule.Match, "/") { 1306 if ts.SourceKeyspaceSchema().Keyspace.Sharded { 1307 filter = key.KeyRangeString(source.GetShard().KeyRange) 1308 } 1309 } else { 1310 var inKeyrange string 1311 if ts.SourceKeyspaceSchema().Keyspace.Sharded { 1312 vtable, ok := ts.SourceKeyspaceSchema().Tables[rule.Match] 1313 if !ok { 1314 return fmt.Errorf("table %s not found in vschema1", rule.Match) 1315 } 1316 // TODO(sougou): handle degenerate cases like sequence, etc. 1317 // We currently assume the primary vindex is the best way to filter, which may not be true. 1318 inKeyrange = fmt.Sprintf(" where in_keyrange(%s, '%s.%s', '%s')", sqlparser.String(vtable.ColumnVindexes[0].Columns[0]), ts.SourceKeyspaceName(), vtable.ColumnVindexes[0].Name, key.KeyRangeString(source.GetShard().KeyRange)) 1319 } 1320 filter = fmt.Sprintf("select * from %s%s", sqlescape.EscapeID(rule.Match), inKeyrange) 1321 } 1322 reverseBls.Filter.Rules = append(reverseBls.Filter.Rules, &binlogdatapb.Rule{ 1323 Match: rule.Match, 1324 Filter: filter, 1325 }) 1326 } 1327 log.Infof("Creating reverse workflow vreplication stream on tablet %s: workflow %s, startPos %s", 1328 source.GetPrimary().Alias, ts.ReverseWorkflowName(), target.Position) 1329 _, err := ts.VReplicationExec(ctx, source.GetPrimary().Alias, 1330 binlogplayer.CreateVReplicationState(ts.ReverseWorkflowName(), reverseBls, target.Position, 1331 binlogplayer.BlpStopped, source.GetPrimary().DbName(), ts.workflowType, ts.workflowSubType)) 1332 if err != nil { 1333 return err 1334 } 1335 1336 // if user has defined the cell/tablet_types parameters in the forward workflow, update the reverse workflow as well 1337 updateQuery := 
ts.getReverseVReplicationUpdateQuery(target.GetPrimary().Alias.Cell, source.GetPrimary().Alias.Cell, source.GetPrimary().DbName()) 1338 if updateQuery != "" { 1339 log.Infof("Updating vreplication stream entry on %s with: %s", source.GetPrimary().Alias, updateQuery) 1340 _, err = ts.VReplicationExec(ctx, source.GetPrimary().Alias, updateQuery) 1341 return err 1342 } 1343 return nil 1344 }) 1345 return err 1346 } 1347 1348 func (ts *trafficSwitcher) getReverseVReplicationUpdateQuery(targetCell string, sourceCell string, dbname string) string { 1349 // we try to be clever to understand what user intends: 1350 // if target's cell is present in cells but not source's cell we replace it with the source's cell 1351 if ts.optCells != "" && targetCell != sourceCell && strings.Contains(ts.optCells+",", targetCell+",") && 1352 !strings.Contains(ts.optCells+",", sourceCell+",") { 1353 ts.optCells = strings.Replace(ts.optCells, targetCell, sourceCell, 1) 1354 } 1355 1356 if ts.optCells != "" || ts.optTabletTypes != "" { 1357 query := fmt.Sprintf("update _vt.vreplication set cell = '%s', tablet_types = '%s' where workflow = '%s' and db_name = '%s'", 1358 ts.optCells, ts.optTabletTypes, ts.ReverseWorkflowName(), dbname) 1359 return query 1360 } 1361 return "" 1362 } 1363 1364 func (ts *trafficSwitcher) deleteReverseVReplication(ctx context.Context) error { 1365 return ts.ForAllSources(func(source *workflow.MigrationSource) error { 1366 query := fmt.Sprintf(sqlDeleteWorkflow, encodeString(source.GetPrimary().DbName()), encodeString(ts.reverseWorkflow)) 1367 if _, err := ts.TabletManagerClient().VReplicationExec(ctx, source.GetPrimary().Tablet, query); err != nil { 1368 return err 1369 } 1370 ts.wr.deleteWorkflowVDiffData(ctx, source.GetPrimary().Tablet, ts.reverseWorkflow) 1371 ts.wr.optimizeCopyStateTable(source.GetPrimary().Tablet) 1372 return nil 1373 }) 1374 } 1375 1376 func (ts *trafficSwitcher) createJournals(ctx context.Context, sourceWorkflows []string) error { 1377 
log.Infof("In createJournals for source workflows %+v", sourceWorkflows) 1378 return ts.ForAllSources(func(source *workflow.MigrationSource) error { 1379 if source.Journaled { 1380 return nil 1381 } 1382 participants := make([]*binlogdatapb.KeyspaceShard, 0) 1383 participantMap := make(map[string]bool) 1384 journal := &binlogdatapb.Journal{ 1385 Id: ts.id, 1386 MigrationType: ts.MigrationType(), 1387 Tables: ts.Tables(), 1388 LocalPosition: source.Position, 1389 Participants: participants, 1390 SourceWorkflows: sourceWorkflows, 1391 } 1392 for targetShard, target := range ts.Targets() { 1393 for _, tsource := range target.Sources { 1394 participantMap[tsource.Shard] = true 1395 } 1396 journal.ShardGtids = append(journal.ShardGtids, &binlogdatapb.ShardGtid{ 1397 Keyspace: ts.TargetKeyspaceName(), 1398 Shard: targetShard, 1399 Gtid: target.Position, 1400 }) 1401 } 1402 shards := make([]string, 0) 1403 for shard := range participantMap { 1404 shards = append(shards, shard) 1405 } 1406 sort.Sort(vreplication.ShardSorter(shards)) 1407 for _, shard := range shards { 1408 journal.Participants = append(journal.Participants, &binlogdatapb.KeyspaceShard{ 1409 Keyspace: source.GetShard().Keyspace(), 1410 Shard: shard, 1411 }) 1412 1413 } 1414 log.Infof("Creating journal %v", journal) 1415 ts.Logger().Infof("Creating journal: %v", journal) 1416 statement := fmt.Sprintf("insert into _vt.resharding_journal "+ 1417 "(id, db_name, val) "+ 1418 "values (%v, %v, %v)", 1419 ts.id, encodeString(source.GetPrimary().DbName()), encodeString(journal.String())) 1420 if _, err := ts.TabletManagerClient().VReplicationExec(ctx, source.GetPrimary().Tablet, statement); err != nil { 1421 return err 1422 } 1423 return nil 1424 }) 1425 } 1426 1427 func (ts *trafficSwitcher) allowTargetWrites(ctx context.Context) error { 1428 if ts.MigrationType() == binlogdatapb.MigrationType_TABLES { 1429 return ts.allowTableTargetWrites(ctx) 1430 } 1431 return ts.changeShardsAccess(ctx, ts.TargetKeyspaceName(), 
ts.TargetShards(), allowWrites) 1432 } 1433 1434 func (ts *trafficSwitcher) allowTableTargetWrites(ctx context.Context) error { 1435 return ts.ForAllTargets(func(target *workflow.MigrationTarget) error { 1436 if _, err := ts.TopoServer().UpdateShardFields(ctx, ts.TargetKeyspaceName(), target.GetShard().ShardName(), func(si *topo.ShardInfo) error { 1437 return si.UpdateSourceDeniedTables(ctx, topodatapb.TabletType_PRIMARY, nil, true, ts.Tables()) 1438 }); err != nil { 1439 return err 1440 } 1441 rtbsCtx, cancel := context.WithTimeout(ctx, shardTabletRefreshTimeout) 1442 defer cancel() 1443 _, _, err := topotools.RefreshTabletsByShard(rtbsCtx, ts.TopoServer(), ts.TabletManagerClient(), target.GetShard(), nil, ts.Logger()) 1444 return err 1445 }) 1446 } 1447 1448 func (ts *trafficSwitcher) changeRouting(ctx context.Context) error { 1449 if ts.MigrationType() == binlogdatapb.MigrationType_TABLES { 1450 return ts.changeWriteRoute(ctx) 1451 } 1452 return ts.changeShardRouting(ctx) 1453 } 1454 1455 func (ts *trafficSwitcher) changeWriteRoute(ctx context.Context) error { 1456 if ts.isPartialMigration { 1457 srr, err := topotools.GetShardRoutingRules(ctx, ts.TopoServer()) 1458 if err != nil { 1459 return err 1460 } 1461 for _, si := range ts.SourceShards() { 1462 delete(srr, fmt.Sprintf("%s.%s", ts.TargetKeyspaceName(), si.ShardName())) 1463 ts.Logger().Infof("Deleted shard routing: %v:%v", ts.TargetKeyspaceName(), si.ShardName()) 1464 srr[fmt.Sprintf("%s.%s", ts.SourceKeyspaceName(), si.ShardName())] = ts.TargetKeyspaceName() 1465 ts.Logger().Infof("Added shard routing: %v:%v", ts.SourceKeyspaceName(), si.ShardName()) 1466 } 1467 if err := topotools.SaveShardRoutingRules(ctx, ts.TopoServer(), srr); err != nil { 1468 return err 1469 } 1470 } else { 1471 rules, err := topotools.GetRoutingRules(ctx, ts.TopoServer()) 1472 if err != nil { 1473 return err 1474 } 1475 for _, table := range ts.Tables() { 1476 targetKsTable := fmt.Sprintf("%s.%s", ts.TargetKeyspaceName(), table) 
1477 sourceKsTable := fmt.Sprintf("%s.%s", ts.SourceKeyspaceName(), table) 1478 delete(rules, targetKsTable) 1479 ts.Logger().Infof("Deleted routing: %s", targetKsTable) 1480 rules[table] = []string{targetKsTable} 1481 rules[sourceKsTable] = []string{targetKsTable} 1482 ts.Logger().Infof("Added routing: %v %v", table, sourceKsTable) 1483 } 1484 if err := topotools.SaveRoutingRules(ctx, ts.TopoServer(), rules); err != nil { 1485 return err 1486 } 1487 } 1488 1489 return ts.TopoServer().RebuildSrvVSchema(ctx, nil) 1490 } 1491 1492 func (ts *trafficSwitcher) changeShardRouting(ctx context.Context) error { 1493 if err := ts.TopoServer().ValidateSrvKeyspace(ctx, ts.TargetKeyspaceName(), ""); err != nil { 1494 err2 := vterrors.Wrapf(err, "Before changing shard routes, found SrvKeyspace for %s is corrupt", ts.TargetKeyspaceName()) 1495 log.Errorf("%w", err2) 1496 return err2 1497 } 1498 err := ts.ForAllSources(func(source *workflow.MigrationSource) error { 1499 _, err := ts.TopoServer().UpdateShardFields(ctx, ts.SourceKeyspaceName(), source.GetShard().ShardName(), func(si *topo.ShardInfo) error { 1500 si.IsPrimaryServing = false 1501 return nil 1502 }) 1503 return err 1504 }) 1505 if err != nil { 1506 return err 1507 } 1508 err = ts.ForAllTargets(func(target *workflow.MigrationTarget) error { 1509 _, err := ts.TopoServer().UpdateShardFields(ctx, ts.TargetKeyspaceName(), target.GetShard().ShardName(), func(si *topo.ShardInfo) error { 1510 si.IsPrimaryServing = true 1511 return nil 1512 }) 1513 return err 1514 }) 1515 if err != nil { 1516 return err 1517 } 1518 err = ts.TopoServer().MigrateServedType(ctx, ts.TargetKeyspaceName(), ts.TargetShards(), ts.SourceShards(), topodatapb.TabletType_PRIMARY, nil) 1519 if err != nil { 1520 return err 1521 } 1522 if err := ts.TopoServer().ValidateSrvKeyspace(ctx, ts.TargetKeyspaceName(), ""); err != nil { 1523 err2 := vterrors.Wrapf(err, "After changing shard routes, found SrvKeyspace for %s is corrupt", ts.TargetKeyspaceName()) 1524 
log.Errorf("%w", err2) 1525 return err2 1526 } 1527 return nil 1528 } 1529 1530 func (ts *trafficSwitcher) deleteShardRoutingRules(ctx context.Context) error { 1531 if !ts.isPartialMigration { 1532 return nil 1533 } 1534 srr, err := topotools.GetShardRoutingRules(ctx, ts.TopoServer()) 1535 if err != nil { 1536 return err 1537 } 1538 for _, si := range ts.TargetShards() { 1539 delete(srr, fmt.Sprintf("%s.%s", ts.targetKeyspace, si.ShardName())) 1540 } 1541 if err := topotools.SaveShardRoutingRules(ctx, ts.TopoServer(), srr); err != nil { 1542 return err 1543 } 1544 return nil 1545 } 1546 1547 func (ts *trafficSwitcher) startReverseVReplication(ctx context.Context) error { 1548 return ts.ForAllSources(func(source *workflow.MigrationSource) error { 1549 query := fmt.Sprintf("update _vt.vreplication set state='Running', message='' where db_name=%s", encodeString(source.GetPrimary().DbName())) 1550 _, err := ts.VReplicationExec(ctx, source.GetPrimary().Alias, query) 1551 return err 1552 }) 1553 } 1554 1555 func (ts *trafficSwitcher) changeShardsAccess(ctx context.Context, keyspace string, shards []*topo.ShardInfo, access accessType) error { 1556 if err := ts.TopoServer().UpdateDisableQueryService(ctx, keyspace, shards, topodatapb.TabletType_PRIMARY, nil, access == disallowWrites /* disable */); err != nil { 1557 return err 1558 } 1559 return ts.wr.refreshPrimaryTablets(ctx, shards) 1560 } 1561 1562 func (ts *trafficSwitcher) SourceShards() []*topo.ShardInfo { 1563 shards := make([]*topo.ShardInfo, 0, len(ts.Sources())) 1564 for _, source := range ts.Sources() { 1565 shards = append(shards, source.GetShard()) 1566 } 1567 return shards 1568 } 1569 1570 func (ts *trafficSwitcher) TargetShards() []*topo.ShardInfo { 1571 shards := make([]*topo.ShardInfo, 0, len(ts.Targets())) 1572 for _, target := range ts.Targets() { 1573 shards = append(shards, target.GetShard()) 1574 } 1575 return shards 1576 } 1577 1578 func (ts *trafficSwitcher) dropSourceDeniedTables(ctx 
// doValidateWorkflowHasCompleted checks that the workflow has finished
// switching traffic and returns every problem found joined into a single
// error, or nil when there is none.
//
// For a shard migration no source shard may still be primary-serving. For a
// table migration every target stream of the workflow must be FROZEN and,
// unless ts.keepRoutingRules is set, no routing rule may still point at one of
// the workflow's tables in the source keyspace.
func doValidateWorkflowHasCompleted(ctx context.Context, ts *trafficSwitcher) error {
	rec := concurrency.AllErrorRecorder{}

	// Problems are recorded rather than returned from the callbacks so that
	// all sources/targets are inspected; the callbacks therefore always return
	// nil and the ForAll* results are intentionally ignored.
	if ts.MigrationType() == binlogdatapb.MigrationType_SHARDS {
		_ = ts.ForAllSources(func(source *workflow.MigrationSource) error {
			if source.GetShard().IsPrimaryServing {
				rec.RecordError(fmt.Errorf("Shard %s is still serving", source.GetShard().ShardName()))
			}
			return nil
		})
	} else {
		_ = ts.ForAllTargets(func(target *workflow.MigrationTarget) error {
			query := fmt.Sprintf("select 1 from _vt.vreplication where db_name='%s' and workflow='%s' and message!='FROZEN'", target.GetPrimary().DbName(), ts.WorkflowName())
			rs, err := ts.VReplicationExec(ctx, target.GetPrimary().Alias, query)
			if err != nil {
				// A failed lookup must not be read as "frozen", and rs cannot
				// be dereferenced in this case.
				rec.RecordError(fmt.Errorf("could not check vreplication streams on tablet %d: %v", target.GetPrimary().Alias.Uid, err))
				return nil
			}
			if rs != nil && len(rs.Rows) > 0 {
				rec.RecordError(fmt.Errorf("vreplication streams are not frozen on tablet %d", target.GetPrimary().Alias.Uid))
			}
			return nil
		})
	}

	// Check whether any of the workflow's tables is still routed to the source.
	if !ts.keepRoutingRules && ts.MigrationType() == binlogdatapb.MigrationType_TABLES {
		rules, err := topotools.GetRoutingRules(ctx, ts.TopoServer())
		if err != nil {
			rec.RecordError(fmt.Errorf("could not get RoutingRules: %v", err))
		}
		// rules is left empty when the lookup failed, so the loop is a no-op then.
		for fromTable, toTables := range rules {
			for _, toTable := range toTables {
				for _, table := range ts.Tables() {
					if toTable == fmt.Sprintf("%s.%s", ts.SourceKeyspaceName(), table) {
						rec.RecordError(fmt.Errorf("routing still exists from keyspace %s table %s to %s", ts.SourceKeyspaceName(), table, fromTable))
					}
				}
			}
		}
	}

	if rec.HasErrors() {
		return fmt.Errorf("%s", strings.Join(rec.ErrorStrings(), "\n"))
	}
	return nil
}
1670 _, err := ts.wr.ExecuteFetchAsDba(ctx, source.GetPrimary().Alias, query, 1, false, true) 1671 if err != nil { 1672 ts.Logger().Errorf("%s: Error removing table %s: %v", source.GetPrimary().String(), tableName, err) 1673 return err 1674 } 1675 ts.Logger().Infof("%s: Removed table %s.%s\n", source.GetPrimary().String(), source.GetPrimary().DbName(), tableName) 1676 1677 } 1678 return nil 1679 }) 1680 if err != nil { 1681 return err 1682 } 1683 1684 return ts.dropParticipatingTablesFromKeyspace(ctx, ts.SourceKeyspaceName()) 1685 } 1686 1687 func (ts *trafficSwitcher) dropParticipatingTablesFromKeyspace(ctx context.Context, keyspace string) error { 1688 vschema, err := ts.TopoServer().GetVSchema(ctx, keyspace) 1689 if err != nil { 1690 return err 1691 } 1692 for _, tableName := range ts.Tables() { 1693 delete(vschema.Tables, tableName) 1694 } 1695 return ts.TopoServer().SaveVSchema(ctx, keyspace, vschema) 1696 } 1697 1698 // FIXME: even after dropSourceShards there are still entries in the topo, need to research and fix 1699 func (ts *trafficSwitcher) dropSourceShards(ctx context.Context) error { 1700 return ts.ForAllSources(func(source *workflow.MigrationSource) error { 1701 ts.Logger().Infof("Deleting shard %s.%s\n", source.GetShard().Keyspace(), source.GetShard().ShardName()) 1702 err := ts.wr.DeleteShard(ctx, source.GetShard().Keyspace(), source.GetShard().ShardName(), true, false) 1703 if err != nil { 1704 ts.Logger().Errorf("Error deleting shard %s: %v", source.GetShard().ShardName(), err) 1705 return err 1706 } 1707 ts.Logger().Infof("Deleted shard %s.%s\n", source.GetShard().Keyspace(), source.GetShard().ShardName()) 1708 return nil 1709 }) 1710 } 1711 1712 func (ts *trafficSwitcher) freezeTargetVReplication(ctx context.Context) error { 1713 // Mark target streams as frozen before deleting. 
If SwitchWrites gets 1714 // re-invoked after a freeze, it will skip all the previous steps 1715 err := ts.ForAllTargets(func(target *workflow.MigrationTarget) error { 1716 ts.Logger().Infof("Marking target streams frozen for workflow %s db_name %s", ts.WorkflowName(), target.GetPrimary().DbName()) 1717 query := fmt.Sprintf("update _vt.vreplication set message = '%s' where db_name=%s and workflow=%s", workflow.Frozen, encodeString(target.GetPrimary().DbName()), encodeString(ts.WorkflowName())) 1718 _, err := ts.TabletManagerClient().VReplicationExec(ctx, target.GetPrimary().Tablet, query) 1719 return err 1720 }) 1721 if err != nil { 1722 return err 1723 } 1724 return nil 1725 } 1726 1727 func (ts *trafficSwitcher) dropTargetVReplicationStreams(ctx context.Context) error { 1728 return ts.ForAllTargets(func(target *workflow.MigrationTarget) error { 1729 ts.Logger().Infof("Deleting target streams and related data for workflow %s db_name %s", ts.WorkflowName(), target.GetPrimary().DbName()) 1730 query := fmt.Sprintf(sqlDeleteWorkflow, encodeString(target.GetPrimary().DbName()), encodeString(ts.WorkflowName())) 1731 if _, err := ts.TabletManagerClient().VReplicationExec(ctx, target.GetPrimary().Tablet, query); err != nil { 1732 return err 1733 } 1734 ts.wr.deleteWorkflowVDiffData(ctx, target.GetPrimary().Tablet, ts.WorkflowName()) 1735 ts.wr.optimizeCopyStateTable(target.GetPrimary().Tablet) 1736 return nil 1737 }) 1738 } 1739 1740 func (ts *trafficSwitcher) dropSourceReverseVReplicationStreams(ctx context.Context) error { 1741 return ts.ForAllSources(func(source *workflow.MigrationSource) error { 1742 ts.Logger().Infof("Deleting reverse streams and related data for workflow %s db_name %s", ts.WorkflowName(), source.GetPrimary().DbName()) 1743 query := fmt.Sprintf(sqlDeleteWorkflow, encodeString(source.GetPrimary().DbName()), encodeString(workflow.ReverseWorkflowName(ts.WorkflowName()))) 1744 if _, err := ts.TabletManagerClient().VReplicationExec(ctx, 
source.GetPrimary().Tablet, query); err != nil { 1745 return err 1746 } 1747 ts.wr.deleteWorkflowVDiffData(ctx, source.GetPrimary().Tablet, workflow.ReverseWorkflowName(ts.WorkflowName())) 1748 ts.wr.optimizeCopyStateTable(source.GetPrimary().Tablet) 1749 return nil 1750 }) 1751 } 1752 1753 func (ts *trafficSwitcher) removeTargetTables(ctx context.Context) error { 1754 log.Infof("removeTargetTables") 1755 err := ts.ForAllTargets(func(target *workflow.MigrationTarget) error { 1756 for _, tableName := range ts.Tables() { 1757 query := fmt.Sprintf("drop table %s.%s", 1758 sqlescape.EscapeID(sqlescape.UnescapeID(target.GetPrimary().DbName())), 1759 sqlescape.EscapeID(sqlescape.UnescapeID(tableName))) 1760 ts.Logger().Infof("%s: Dropping table %s.%s\n", 1761 target.GetPrimary().String(), target.GetPrimary().DbName(), tableName) 1762 _, err := ts.wr.ExecuteFetchAsDba(ctx, target.GetPrimary().Alias, query, 1, false, true) 1763 if err != nil { 1764 ts.Logger().Errorf("%s: Error removing table %s: %v", 1765 target.GetPrimary().String(), tableName, err) 1766 return err 1767 } 1768 ts.Logger().Infof("%s: Removed table %s.%s\n", 1769 target.GetPrimary().String(), target.GetPrimary().DbName(), tableName) 1770 1771 } 1772 return nil 1773 }) 1774 if err != nil { 1775 return err 1776 } 1777 1778 return ts.dropParticipatingTablesFromKeyspace(ctx, ts.TargetKeyspaceName()) 1779 1780 } 1781 1782 func (ts *trafficSwitcher) dropTargetShards(ctx context.Context) error { 1783 return ts.ForAllTargets(func(target *workflow.MigrationTarget) error { 1784 ts.Logger().Infof("Deleting shard %s.%s\n", target.GetShard().Keyspace(), target.GetShard().ShardName()) 1785 err := ts.wr.DeleteShard(ctx, target.GetShard().Keyspace(), target.GetShard().ShardName(), true, false) 1786 if err != nil { 1787 ts.Logger().Errorf("Error deleting shard %s: %v", target.GetShard().ShardName(), err) 1788 return err 1789 } 1790 ts.Logger().Infof("Deleted shard %s.%s\n", target.GetShard().Keyspace(), 
target.GetShard().ShardName()) 1791 return nil 1792 }) 1793 } 1794 1795 func (ts *trafficSwitcher) deleteRoutingRules(ctx context.Context) error { 1796 rules, err := topotools.GetRoutingRules(ctx, ts.TopoServer()) 1797 if err != nil { 1798 return err 1799 } 1800 for _, table := range ts.Tables() { 1801 delete(rules, table) 1802 delete(rules, table+"@replica") 1803 delete(rules, table+"@rdonly") 1804 delete(rules, ts.TargetKeyspaceName()+"."+table) 1805 delete(rules, ts.TargetKeyspaceName()+"."+table+"@replica") 1806 delete(rules, ts.TargetKeyspaceName()+"."+table+"@rdonly") 1807 delete(rules, ts.SourceKeyspaceName()+"."+table) 1808 delete(rules, ts.SourceKeyspaceName()+"."+table+"@replica") 1809 delete(rules, ts.SourceKeyspaceName()+"."+table+"@rdonly") 1810 } 1811 if err := topotools.SaveRoutingRules(ctx, ts.TopoServer(), rules); err != nil { 1812 return err 1813 } 1814 return nil 1815 } 1816 1817 // addParticipatingTablesToKeyspace updates the vschema with the new tables that were created as part of the 1818 // Migrate flow. 
It is called when the Migrate flow is Completed 1819 func (ts *trafficSwitcher) addParticipatingTablesToKeyspace(ctx context.Context, keyspace, tableSpecs string) error { 1820 vschema, err := ts.TopoServer().GetVSchema(ctx, keyspace) 1821 if err != nil { 1822 return err 1823 } 1824 if vschema == nil { 1825 return fmt.Errorf("no vschema found for keyspace %s", keyspace) 1826 } 1827 if vschema.Tables == nil { 1828 vschema.Tables = make(map[string]*vschemapb.Table) 1829 } 1830 if strings.HasPrefix(tableSpecs, "{") { // user defined the vschema snippet, typically for a sharded target 1831 wrap := fmt.Sprintf(`{"tables": %s}`, tableSpecs) 1832 ks := &vschemapb.Keyspace{} 1833 if err := json2.Unmarshal([]byte(wrap), ks); err != nil { 1834 return err 1835 } 1836 if err != nil { 1837 return err 1838 } 1839 for table, vtab := range ks.Tables { 1840 vschema.Tables[table] = vtab 1841 } 1842 } else { 1843 if vschema.Sharded { 1844 return fmt.Errorf("no sharded vschema was provided, so you will need to update the vschema of the target manually for the moved tables") 1845 } 1846 for _, table := range ts.tables { 1847 vschema.Tables[table] = &vschemapb.Table{} 1848 } 1849 } 1850 return ts.TopoServer().SaveVSchema(ctx, keyspace, vschema) 1851 }