vitess.io/vitess@v0.16.2/go/vt/wrangler/workflow.go (about) 1 package wrangler 2 3 import ( 4 "context" 5 "fmt" 6 "sort" 7 "strings" 8 "sync" 9 "time" 10 11 "vitess.io/vitess/go/mysql" 12 "vitess.io/vitess/go/sqltypes" 13 "vitess.io/vitess/go/vt/discovery" 14 "vitess.io/vitess/go/vt/log" 15 "vitess.io/vitess/go/vt/topo" 16 "vitess.io/vitess/go/vt/topotools" 17 "vitess.io/vitess/go/vt/vtctl/workflow" 18 "vitess.io/vitess/go/vt/vtgate/evalengine" 19 20 tabletmanagerdatapb "vitess.io/vitess/go/vt/proto/tabletmanagerdata" 21 topodatapb "vitess.io/vitess/go/vt/proto/topodata" 22 ) 23 24 // VReplicationWorkflowType specifies whether workflow is MoveTables or Reshard 25 type VReplicationWorkflowType int 26 27 // VReplicationWorkflowType enums 28 const ( 29 MoveTablesWorkflow = VReplicationWorkflowType(iota) 30 ReshardWorkflow 31 MigrateWorkflow 32 ) 33 34 // Workflow state display strings 35 const ( 36 WorkflowStateNotCreated = "Not Created" 37 WorkflowStateNotSwitched = "Reads Not Switched. Writes Not Switched" 38 WorkflowStateReadsSwitched = "All Reads Switched. Writes Not Switched" 39 WorkflowStateWritesSwitched = "Reads Not Switched. Writes Switched" 40 WorkflowStateAllSwitched = "All Reads Switched. Writes Switched" 41 ) 42 43 // region Move Tables Public API 44 45 // VReplicationWorkflowParams stores args and options passed to a VReplicationWorkflow command 46 type VReplicationWorkflowParams struct { 47 WorkflowType VReplicationWorkflowType 48 Workflow, TargetKeyspace string 49 Cells, TabletTypes, ExcludeTables string 50 EnableReverseReplication, DryRun bool 51 KeepData bool 52 KeepRoutingRules bool 53 Timeout time.Duration 54 Direction workflow.TrafficSwitchDirection 55 MaxAllowedTransactionLagSeconds int64 56 OnDDL string 57 58 // MoveTables/Migrate specific 59 SourceKeyspace, Tables string 60 AllTables, RenameTables bool 61 SourceTimeZone string 62 DropForeignKeys bool 63 64 // Reshard specific 65 SourceShards, TargetShards []string 66 SkipSchemaCopy bool 67 AutoStart, StopAfterCopy bool 68 69 // MoveTables/Migrate and Reshard specific 70 DeferSecondaryKeys bool 71 72 // Migrate specific 73 ExternalCluster string 74 } 75 76 // VReplicationWorkflow stores various internal objects for a workflow 77 type VReplicationWorkflow struct { 78 workflowType VReplicationWorkflowType 79 ctx context.Context 80 wr *Wrangler 81 params *VReplicationWorkflowParams 82 ts *trafficSwitcher 83 ws *workflow.State 84 } 85 86 func (vrw *VReplicationWorkflow) String() string { 87 s := "" 88 s += fmt.Sprintf("Parameters: %+v\n", vrw.params) 89 s += fmt.Sprintf("State: %+v", vrw.CachedState()) 90 return s 91 } 92 93 // NewVReplicationWorkflow sets up a MoveTables or Reshard workflow based on options provided, deduces the state of the 94 // workflow from the persistent state stored in the vreplication table and the topo 95 func (wr *Wrangler) NewVReplicationWorkflow(ctx context.Context, workflowType VReplicationWorkflowType, 96 params *VReplicationWorkflowParams) (*VReplicationWorkflow, error) { 97 98 log.Infof("NewVReplicationWorkflow with params %+v", params) 99 vrw := &VReplicationWorkflow{wr: wr, ctx: ctx, params: params, workflowType: workflowType} 100 ts, ws, err := wr.getWorkflowState(ctx, params.TargetKeyspace, params.Workflow) 101 if err != nil { 102 return nil, err 103 } 104 log.Infof("Workflow state is %+v", ws) 105 if ts != nil { //Other than on create we need to get SourceKeyspace from the workflow 106 vrw.params.TargetKeyspace = ts.targetKeyspace 107 vrw.params.Workflow = ts.workflow 108 vrw.params.SourceKeyspace = ts.sourceKeyspace 109 } 110 vrw.ts = ts 111 vrw.ws = ws 112 return vrw, nil 113 } 114 115 func (vrw *VReplicationWorkflow) reloadState() (*workflow.State, error) { 116 var err error 117 vrw.ts, vrw.ws, err = vrw.wr.getWorkflowState(vrw.ctx, vrw.params.TargetKeyspace, vrw.params.Workflow) 118 return vrw.ws, err 119 } 120 121 // CurrentState reloads and returns a human readable workflow state 122 func (vrw *VReplicationWorkflow) CurrentState() string { 123 var err error 124 vrw.ws, err = vrw.reloadState() 125 if err != nil { 126 return err.Error() 127 } 128 if vrw.ws == nil { 129 return "Workflow Not Found" 130 } 131 return vrw.stateAsString(vrw.ws) 132 } 133 134 // CachedState returns a human readable workflow state at the time the workflow was created 135 func (vrw *VReplicationWorkflow) CachedState() string { 136 return vrw.stateAsString(vrw.ws) 137 } 138 139 // Exists checks if the workflow has already been initiated 140 func (vrw *VReplicationWorkflow) Exists() bool { 141 return vrw.ws != nil 142 } 143 144 func (vrw *VReplicationWorkflow) stateAsString(ws *workflow.State) string { 145 log.Infof("Workflow state is %+v", ws) 146 var stateInfo []string 147 s := "" 148 if !vrw.Exists() { 149 stateInfo = append(stateInfo, WorkflowStateNotCreated) 150 } else { 151 if !ws.IsPartialMigration { // shard level traffic switching is all or nothing 152 if len(ws.RdonlyCellsNotSwitched) == 0 && len(ws.ReplicaCellsNotSwitched) == 0 && len(ws.ReplicaCellsSwitched) > 0 { 153 s = "All Reads Switched" 154 } else if len(ws.RdonlyCellsSwitched) == 0 && len(ws.ReplicaCellsSwitched) == 0 { 155 s = "Reads Not Switched" 156 } else { 157 stateInfo = append(stateInfo, "Reads partially switched") 158 if len(ws.ReplicaCellsNotSwitched) == 0 { 159 s += "All Replica Reads Switched" 160 } else if len(ws.ReplicaCellsSwitched) == 0 { 161 s += "Replica not switched" 162 } else { 163 s += "Replica switched in cells: " + strings.Join(ws.ReplicaCellsSwitched, ",") 164 } 165 stateInfo = append(stateInfo, s) 166 s = "" 167 if len(ws.RdonlyCellsNotSwitched) == 0 { 168 s += "All Rdonly Reads Switched" 169 } else if len(ws.RdonlyCellsSwitched) == 0 { 170 s += "Rdonly not switched" 171 } else { 172 s += "Rdonly switched in cells: " + strings.Join(ws.RdonlyCellsSwitched, ",") 173 } 174 } 175 stateInfo = append(stateInfo, s) 176 } 177 if ws.WritesSwitched { 178 stateInfo = append(stateInfo, "Writes Switched") 179 } else if ws.IsPartialMigration { 180 // For partial migrations, the traffic switching is all or nothing 181 // at the shard level, so reads are effectively switched on the 182 // shard when writes are switched. 183 if len(ws.ShardsAlreadySwitched) > 0 && len(ws.ShardsNotYetSwitched) > 0 { 184 stateInfo = append(stateInfo, fmt.Sprintf("Reads partially switched, for shards: %s", strings.Join(ws.ShardsAlreadySwitched, ","))) 185 stateInfo = append(stateInfo, fmt.Sprintf("Writes partially switched, for shards: %s", strings.Join(ws.ShardsAlreadySwitched, ","))) 186 } else { 187 if len(ws.ShardsAlreadySwitched) == 0 { 188 stateInfo = append(stateInfo, "Reads Not Switched") 189 stateInfo = append(stateInfo, "Writes Not Switched") 190 } else { 191 stateInfo = append(stateInfo, "All Reads Switched") 192 stateInfo = append(stateInfo, "All Writes Switched") 193 } 194 } 195 } else { 196 stateInfo = append(stateInfo, "Writes Not Switched") 197 } 198 } 199 return strings.Join(stateInfo, ". ") 200 } 201 202 // Create initiates a workflow 203 func (vrw *VReplicationWorkflow) Create(ctx context.Context) error { 204 var err error 205 if vrw.Exists() { 206 return fmt.Errorf("workflow already exists") 207 } 208 if vrw.CachedState() != WorkflowStateNotCreated { 209 return fmt.Errorf("workflow has already been created, state is %s", vrw.CachedState()) 210 } 211 switch vrw.workflowType { 212 case MoveTablesWorkflow, MigrateWorkflow: 213 err = vrw.initMoveTables() 214 case ReshardWorkflow: 215 excludeTables := strings.Split(vrw.params.ExcludeTables, ",") 216 keyspace := vrw.params.SourceKeyspace 217 218 vschmErr := vrw.wr.ValidateVSchema(ctx, keyspace, vrw.params.SourceShards, excludeTables, true /*includeViews*/) 219 if vschmErr != nil { 220 return fmt.Errorf("Create ReshardWorkflow failed: %v", vschmErr) 221 } 222 223 err = vrw.initReshard() 224 default: 225 return fmt.Errorf("unknown workflow type %d", vrw.workflowType) 226 } 227 if err != nil { 228 return err 229 } 230 return nil 231 } 232 233 // WorkflowError has per stream errors if present in a workflow 234 type WorkflowError struct { 235 Tablet string 236 ID int64 237 Description string 238 } 239 240 // NewWorkflowError returns a new WorkflowError object 241 func NewWorkflowError(tablet string, id int64, description string) *WorkflowError { 242 wfErr := &WorkflowError{ 243 Tablet: tablet, 244 ID: id, 245 Description: description, 246 } 247 return wfErr 248 } 249 250 // GetStreamCount returns a count of total streams and of streams that have started processing 251 func (vrw *VReplicationWorkflow) GetStreamCount() (int64, int64, []*WorkflowError, error) { 252 var err error 253 var workflowErrors []*WorkflowError 254 var total, started int64 255 res, err := vrw.wr.ShowWorkflow(vrw.ctx, vrw.params.Workflow, vrw.params.TargetKeyspace) 256 if err != nil { 257 return 0, 0, nil, err 258 } 259 for ksShard := range res.ShardStatuses { 260 statuses := res.ShardStatuses[ksShard].PrimaryReplicationStatuses 261 for _, st := range statuses { 262 total++ 263 if strings.HasPrefix(st.Message, "Error:") { 264 workflowErrors = append(workflowErrors, NewWorkflowError(st.Tablet, st.ID, st.Message)) 265 continue 266 } 267 if st.Pos == "" { 268 continue 269 } 270 if st.State == "Running" || st.State == "Copying" { 271 started++ 272 } 273 } 274 } 275 276 return total, started, workflowErrors, nil 277 } 278 279 // SwitchTraffic switches traffic in the direction passed for specified tablet_types 280 func (vrw *VReplicationWorkflow) SwitchTraffic(direction workflow.TrafficSwitchDirection) (*[]string, error) { 281 var dryRunResults []string 282 var rdDryRunResults, wrDryRunResults *[]string 283 var err error 284 var hasReplica, hasRdonly, hasPrimary bool 285 286 if !vrw.Exists() { 287 return nil, fmt.Errorf("workflow has not yet been started") 288 } 289 if vrw.workflowType == MigrateWorkflow { 290 return nil, fmt.Errorf("invalid action for Migrate workflow: SwitchTraffic") 291 } 292 293 vrw.params.Direction = direction 294 295 workflowName := vrw.params.Workflow 296 keyspace := vrw.params.TargetKeyspace 297 if vrw.params.Direction == workflow.DirectionBackward { 298 workflowName = workflow.ReverseWorkflowName(workflowName) 299 keyspace = vrw.params.SourceKeyspace 300 } 301 302 reason, err := vrw.canSwitch(keyspace, workflowName) 303 if err != nil { 304 return nil, err 305 } 306 if reason != "" { 307 return nil, fmt.Errorf("cannot switch traffic for workflow %s at this time: %s", workflowName, reason) 308 } 309 310 hasReplica, hasRdonly, hasPrimary, err = vrw.parseTabletTypes() 311 if err != nil { 312 return nil, err 313 } 314 if hasReplica || hasRdonly { 315 if rdDryRunResults, err = vrw.switchReads(); err != nil { 316 return nil, err 317 } 318 } 319 if rdDryRunResults != nil { 320 dryRunResults = append(dryRunResults, *rdDryRunResults...) 321 } 322 if hasPrimary { 323 if wrDryRunResults, err = vrw.switchWrites(); err != nil { 324 return nil, err 325 } 326 } 327 if wrDryRunResults != nil { 328 dryRunResults = append(dryRunResults, *wrDryRunResults...) 329 } 330 return &dryRunResults, nil 331 } 332 333 // ReverseTraffic switches traffic backwards for tablet_types passed 334 func (vrw *VReplicationWorkflow) ReverseTraffic() (*[]string, error) { 335 if !vrw.Exists() { 336 return nil, fmt.Errorf("workflow has not yet been started") 337 } 338 if vrw.workflowType == MigrateWorkflow { 339 return nil, fmt.Errorf("invalid action for Migrate workflow: ReverseTraffic") 340 } 341 return vrw.SwitchTraffic(workflow.DirectionBackward) 342 } 343 344 // Workflow errors 345 const ( 346 ErrWorkflowNotFullySwitched = "cannot complete workflow because you have not yet switched all read and write traffic" 347 ErrWorkflowPartiallySwitched = "cannot cancel workflow because you have already switched some or all read and write traffic" 348 ) 349 350 // Complete cleans up a successful workflow 351 func (vrw *VReplicationWorkflow) Complete() (*[]string, error) { 352 var dryRunResults *[]string 353 var err error 354 ws := vrw.ws 355 356 if vrw.workflowType == MigrateWorkflow { 357 return vrw.wr.finalizeMigrateWorkflow(vrw.ctx, ws.TargetKeyspace, ws.Workflow, vrw.params.Tables, 358 false, vrw.params.KeepData, vrw.params.KeepRoutingRules, vrw.params.DryRun) 359 } 360 361 if !ws.WritesSwitched || len(ws.ReplicaCellsNotSwitched) > 0 || len(ws.RdonlyCellsNotSwitched) > 0 { 362 return nil, fmt.Errorf(ErrWorkflowNotFullySwitched) 363 } 364 var renameTable workflow.TableRemovalType 365 if vrw.params.RenameTables { 366 renameTable = workflow.RenameTable 367 } else { 368 renameTable = workflow.DropTable 369 } 370 if dryRunResults, err = vrw.wr.DropSources(vrw.ctx, vrw.ws.TargetKeyspace, vrw.ws.Workflow, renameTable, 371 vrw.params.KeepData, vrw.params.KeepRoutingRules, false /* force */, vrw.params.DryRun); err != nil { 372 return nil, err 373 } 374 return dryRunResults, nil 375 } 376 377 // Cancel deletes all artifacts from a workflow which has not yet been switched 378 func (vrw *VReplicationWorkflow) Cancel() error { 379 ws := vrw.ws 380 if vrw.workflowType == MigrateWorkflow { 381 _, err := vrw.wr.finalizeMigrateWorkflow(vrw.ctx, ws.TargetKeyspace, ws.Workflow, "", 382 true, vrw.params.KeepData, vrw.params.KeepRoutingRules, vrw.params.DryRun) 383 return err 384 } 385 386 if ws.WritesSwitched || len(ws.ReplicaCellsSwitched) > 0 || len(ws.RdonlyCellsSwitched) > 0 { 387 return fmt.Errorf(ErrWorkflowPartiallySwitched) 388 } 389 if _, err := vrw.wr.DropTargets(vrw.ctx, vrw.ws.TargetKeyspace, vrw.ws.Workflow, vrw.params.KeepData, vrw.params.KeepRoutingRules, false); err != nil { 390 return err 391 } 392 vrw.ts = nil 393 return nil 394 } 395 396 // endregion 397 398 // region Helpers 399 400 func (vrw *VReplicationWorkflow) getCellsAsArray() []string { 401 if vrw.params.Cells != "" { 402 return strings.Split(vrw.params.Cells, ",") 403 } 404 return nil 405 } 406 407 func (vrw *VReplicationWorkflow) parseTabletTypes() (hasReplica, hasRdonly, hasPrimary bool, err error) { 408 tabletTypes, _, err := discovery.ParseTabletTypesAndOrder(vrw.params.TabletTypes) 409 if err != nil { 410 return false, false, false, err 411 } 412 for _, tabletType := range tabletTypes { 413 switch tabletType { 414 case topodatapb.TabletType_REPLICA: 415 hasReplica = true 416 case topodatapb.TabletType_RDONLY: 417 hasRdonly = true 418 case topodatapb.TabletType_PRIMARY: 419 hasPrimary = true 420 default: 421 return false, false, false, fmt.Errorf("invalid tablet type passed %s", tabletType) 422 } 423 } 424 return hasReplica, hasRdonly, hasPrimary, nil 425 } 426 427 // endregion 428 429 // region Core Actions 430 431 func (vrw *VReplicationWorkflow) initMoveTables() error { 432 log.Infof("In VReplicationWorkflow.initMoveTables() for %+v", vrw) 433 return vrw.wr.MoveTables(vrw.ctx, vrw.params.Workflow, vrw.params.SourceKeyspace, vrw.params.TargetKeyspace, 434 vrw.params.Tables, vrw.params.Cells, vrw.params.TabletTypes, vrw.params.AllTables, vrw.params.ExcludeTables, 435 vrw.params.AutoStart, vrw.params.StopAfterCopy, vrw.params.ExternalCluster, vrw.params.DropForeignKeys, 436 vrw.params.DeferSecondaryKeys, vrw.params.SourceTimeZone, vrw.params.OnDDL, vrw.params.SourceShards) 437 } 438 439 func (vrw *VReplicationWorkflow) initReshard() error { 440 log.Infof("In VReplicationWorkflow.initReshard() for %+v", vrw) 441 return vrw.wr.Reshard(vrw.ctx, vrw.params.TargetKeyspace, vrw.params.Workflow, vrw.params.SourceShards, 442 vrw.params.TargetShards, vrw.params.SkipSchemaCopy, vrw.params.Cells, vrw.params.TabletTypes, 443 vrw.params.OnDDL, vrw.params.AutoStart, vrw.params.StopAfterCopy, vrw.params.DeferSecondaryKeys) 444 } 445 446 func (vrw *VReplicationWorkflow) switchReads() (*[]string, error) { 447 log.Infof("In VReplicationWorkflow.switchReads() for %+v", vrw) 448 fullTabletTypes, _, err := discovery.ParseTabletTypesAndOrder(vrw.params.TabletTypes) 449 if err != nil { 450 return nil, err 451 } 452 var nonPrimaryTabletTypes []topodatapb.TabletType 453 for _, tt := range fullTabletTypes { 454 if tt != topodatapb.TabletType_PRIMARY { 455 nonPrimaryTabletTypes = append(nonPrimaryTabletTypes, tt) 456 } 457 } 458 var dryRunResults *[]string 459 dryRunResults, err = vrw.wr.SwitchReads(vrw.ctx, vrw.params.TargetKeyspace, vrw.params.Workflow, nonPrimaryTabletTypes, 460 vrw.getCellsAsArray(), vrw.params.Direction, vrw.params.DryRun) 461 if err != nil { 462 return nil, err 463 } 464 return dryRunResults, nil 465 } 466 467 func (vrw *VReplicationWorkflow) switchWrites() (*[]string, error) { 468 var journalID int64 469 var dryRunResults *[]string 470 var err error 471 log.Infof("In VReplicationWorkflow.switchWrites() for %+v", vrw) 472 if vrw.params.Direction == workflow.DirectionBackward { 473 keyspace := vrw.params.SourceKeyspace 474 vrw.params.SourceKeyspace = vrw.params.TargetKeyspace 475 vrw.params.TargetKeyspace = keyspace 476 vrw.params.Workflow = workflow.ReverseWorkflowName(vrw.params.Workflow) 477 log.Infof("In VReplicationWorkflow.switchWrites(reverse) for %+v", vrw) 478 } 479 journalID, dryRunResults, err = vrw.wr.SwitchWrites(vrw.ctx, vrw.params.TargetKeyspace, vrw.params.Workflow, vrw.params.Timeout, 480 false, vrw.params.Direction == workflow.DirectionBackward, vrw.params.EnableReverseReplication, vrw.params.DryRun) 481 if err != nil { 482 return nil, err 483 } 484 log.Infof("switchWrites succeeded with journal id %s", journalID) 485 return dryRunResults, nil 486 } 487 488 // endregion 489 490 // region Copy Progress 491 492 // TableCopyProgress stores the row counts and disk sizes of the source and target tables 493 type TableCopyProgress struct { 494 TargetRowCount, TargetTableSize int64 495 SourceRowCount, SourceTableSize int64 496 } 497 498 // CopyProgress stores the TableCopyProgress for all tables still being copied 499 type CopyProgress map[string]*TableCopyProgress 500 501 const ( 502 cannotSwitchError = "workflow has errors" 503 cannotSwitchCopyIncomplete = "copy is still in progress" 504 cannotSwitchHighLag = "replication lag %ds is higher than allowed lag %ds" 505 cannotSwitchFailedTabletRefresh = "could not refresh all of the tablets involved in the operation:\n%s" 506 cannotSwitchFrozen = "workflow is frozen" 507 ) 508 509 func (vrw *VReplicationWorkflow) canSwitch(keyspace, workflowName string) (reason string, err error) { 510 ws, err := vrw.reloadState() 511 if err != nil { 512 return "", err 513 } 514 if vrw.params.Direction == workflow.DirectionForward && ws.WritesSwitched || 515 vrw.params.Direction == workflow.DirectionBackward && !ws.WritesSwitched { 516 log.Infof("writes already switched no need to check lag") 517 return "", nil 518 } 519 log.Infof("state:%s, direction %d, switched %t", vrw.CachedState(), vrw.params.Direction, ws.WritesSwitched) 520 result, err := vrw.wr.getStreams(vrw.ctx, workflowName, keyspace) 521 if err != nil { 522 return "", err 523 } 524 for ksShard := range result.ShardStatuses { 525 statuses := result.ShardStatuses[ksShard].PrimaryReplicationStatuses 526 for _, st := range statuses { 527 switch st.State { 528 case "Copying": 529 return cannotSwitchCopyIncomplete, nil 530 case "Error": 531 return cannotSwitchError, nil 532 } 533 } 534 } 535 if result.Frozen { 536 return cannotSwitchFrozen, nil 537 } 538 if result.MaxVReplicationTransactionLag > vrw.params.MaxAllowedTransactionLagSeconds { 539 return fmt.Sprintf(cannotSwitchHighLag, result.MaxVReplicationTransactionLag, vrw.params.MaxAllowedTransactionLagSeconds), nil 540 } 541 542 // Ensure that the tablets on both sides are in good shape as we make this same call in the process 543 // and an error will cause us to backout 544 refreshErrors := strings.Builder{} 545 var m sync.Mutex 546 var wg sync.WaitGroup 547 rtbsCtx, cancel := context.WithTimeout(vrw.ctx, shardTabletRefreshTimeout) 548 defer cancel() 549 refreshTablets := func(shards []*topo.ShardInfo, stype string) { 550 defer wg.Done() 551 for _, si := range shards { 552 if partial, partialDetails, err := topotools.RefreshTabletsByShard(rtbsCtx, vrw.wr.ts, vrw.wr.tmc, si, nil, vrw.wr.Logger()); err != nil || partial { 553 m.Lock() 554 refreshErrors.WriteString(fmt.Sprintf("failed to successfully refresh all tablets in the %s/%s %s shard (%v):\n %v\n", 555 si.Keyspace(), si.ShardName(), stype, err, partialDetails)) 556 m.Unlock() 557 } 558 } 559 } 560 wg.Add(1) 561 go refreshTablets(vrw.ts.SourceShards(), "source") 562 wg.Add(1) 563 go refreshTablets(vrw.ts.TargetShards(), "target") 564 wg.Wait() 565 if refreshErrors.Len() > 0 { 566 return fmt.Sprintf(cannotSwitchFailedTabletRefresh, refreshErrors.String()), nil 567 } 568 return "", nil 569 } 570 571 // GetCopyProgress returns the progress of all tables being copied in the workflow 572 func (vrw *VReplicationWorkflow) GetCopyProgress() (*CopyProgress, error) { 573 ctx := context.Background() 574 getTablesQuery := "select distinct table_name from _vt.copy_state cs, _vt.vreplication vr where vr.id = cs.vrepl_id and vr.id = %d" 575 getRowCountQuery := "select table_name, table_rows, data_length from information_schema.tables where table_schema = %s and table_name in (%s)" 576 tables := make(map[string]bool) 577 const MaxRows = 1000 578 sourcePrimaries := make(map[*topodatapb.TabletAlias]bool) 579 for _, target := range vrw.ts.targets { 580 for id, bls := range target.Sources { 581 query := fmt.Sprintf(getTablesQuery, id) 582 p3qr, err := vrw.wr.tmc.ExecuteFetchAsDba(ctx, target.GetPrimary().Tablet, true, &tabletmanagerdatapb.ExecuteFetchAsDbaRequest{ 583 Query: []byte(query), 584 MaxRows: MaxRows, 585 }) 586 if err != nil { 587 return nil, err 588 } 589 if len(p3qr.Rows) < 1 { 590 continue 591 } 592 qr := sqltypes.Proto3ToResult(p3qr) 593 for i := 0; i < len(p3qr.Rows); i++ { 594 tables[qr.Rows[i][0].ToString()] = true 595 } 596 sourcesi, err := vrw.wr.ts.GetShard(ctx, bls.Keyspace, bls.Shard) 597 if err != nil { 598 return nil, err 599 } 600 found := false 601 for existingSource := range sourcePrimaries { 602 if existingSource.Uid == sourcesi.PrimaryAlias.Uid { 603 found = true 604 } 605 } 606 if !found { 607 sourcePrimaries[sourcesi.PrimaryAlias] = true 608 } 609 } 610 } 611 if len(tables) == 0 { 612 return nil, nil 613 } 614 var tableList []string 615 targetRowCounts := make(map[string]int64) 616 sourceRowCounts := make(map[string]int64) 617 targetTableSizes := make(map[string]int64) 618 sourceTableSizes := make(map[string]int64) 619 620 for table := range tables { 621 tableList = append(tableList, encodeString(table)) 622 targetRowCounts[table] = 0 623 sourceRowCounts[table] = 0 624 targetTableSizes[table] = 0 625 sourceTableSizes[table] = 0 626 } 627 628 var getTableMetrics = func(tablet *topodatapb.Tablet, query string, rowCounts *map[string]int64, tableSizes *map[string]int64) error { 629 p3qr, err := vrw.wr.tmc.ExecuteFetchAsDba(ctx, tablet, true, &tabletmanagerdatapb.ExecuteFetchAsDbaRequest{ 630 Query: []byte(query), 631 MaxRows: uint64(len(tables)), 632 }) 633 if err != nil { 634 return err 635 } 636 qr := sqltypes.Proto3ToResult(p3qr) 637 for i := 0; i < len(qr.Rows); i++ { 638 table := qr.Rows[i][0].ToString() 639 rowCount, err := evalengine.ToInt64(qr.Rows[i][1]) 640 if err != nil { 641 return err 642 } 643 tableSize, err := evalengine.ToInt64(qr.Rows[i][2]) 644 if err != nil { 645 return err 646 } 647 (*rowCounts)[table] += rowCount 648 (*tableSizes)[table] += tableSize 649 } 650 return nil 651 } 652 sourceDbName := "" 653 for _, tsSource := range vrw.ts.sources { 654 sourceDbName = tsSource.GetPrimary().DbName() 655 break 656 } 657 if sourceDbName == "" { 658 return nil, fmt.Errorf("no sources found for workflow %s.%s", vrw.ws.TargetKeyspace, vrw.ws.Workflow) 659 } 660 targetDbName := "" 661 for _, tsTarget := range vrw.ts.targets { 662 targetDbName = tsTarget.GetPrimary().DbName() 663 break 664 } 665 if sourceDbName == "" || targetDbName == "" { 666 return nil, fmt.Errorf("workflow %s.%s is incorrectly configured", vrw.ws.TargetKeyspace, vrw.ws.Workflow) 667 } 668 sort.Strings(tableList) // sort list for repeatability for mocking in tests 669 tablesStr := strings.Join(tableList, ",") 670 query := fmt.Sprintf(getRowCountQuery, encodeString(targetDbName), tablesStr) 671 for _, target := range vrw.ts.targets { 672 tablet := target.GetPrimary().Tablet 673 if err := getTableMetrics(tablet, query, &targetRowCounts, &targetTableSizes); err != nil { 674 return nil, err 675 } 676 } 677 678 query = fmt.Sprintf(getRowCountQuery, encodeString(sourceDbName), tablesStr) 679 for source := range sourcePrimaries { 680 ti, err := vrw.wr.ts.GetTablet(ctx, source) 681 tablet := ti.Tablet 682 if err != nil { 683 return nil, err 684 } 685 if err := getTableMetrics(tablet, query, &sourceRowCounts, &sourceTableSizes); err != nil { 686 return nil, err 687 } 688 } 689 690 copyProgress := CopyProgress{} 691 for table, rowCount := range targetRowCounts { 692 copyProgress[table] = &TableCopyProgress{ 693 TargetRowCount: rowCount, 694 TargetTableSize: targetTableSizes[table], 695 SourceRowCount: sourceRowCounts[table], 696 SourceTableSize: sourceTableSizes[table], 697 } 698 } 699 return ©Progress, nil 700 } 701 702 // endregion 703 704 // region Workflow related utility functions 705 706 // deleteWorkflowVDiffData cleans up any potential VDiff related data associated with the workflow on the given tablet 707 func (wr *Wrangler) deleteWorkflowVDiffData(ctx context.Context, tablet *topodatapb.Tablet, workflow string) { 708 sqlDeleteVDiffs := `delete from vd, vdt, vdl using _vt.vdiff as vd inner join _vt.vdiff_table as vdt on (vd.id = vdt.vdiff_id) 709 inner join _vt.vdiff_log as vdl on (vd.id = vdl.vdiff_id) 710 where vd.keyspace = %s and vd.workflow = %s` 711 query := fmt.Sprintf(sqlDeleteVDiffs, encodeString(tablet.Keyspace), encodeString(workflow)) 712 rows := -1 713 if _, err := wr.tmc.ExecuteFetchAsDba(ctx, tablet, false, &tabletmanagerdatapb.ExecuteFetchAsDbaRequest{ 714 Query: []byte(query), 715 MaxRows: uint64(rows), 716 }); err != nil { 717 if sqlErr, ok := err.(*mysql.SQLError); ok && sqlErr.Num != mysql.ERNoSuchTable { // the tables may not exist if no vdiffs have been run 718 wr.Logger().Errorf("Error deleting vdiff data for %s.%s workflow: %v", tablet.Keyspace, workflow, err) 719 } 720 } 721 } 722 723 // optimizeCopyStateTable rebuilds the copy_state table to ensure the on-disk 724 // structures are minimal and optimized and resets the auto-inc value for 725 // subsequent inserts. 726 // This helps to ensure that the size, storage, and performance related factors 727 // for the table remain optimal over time and that we don't ever exhaust the 728 // available auto-inc values for the table. 729 // Note: it's not critical that this executes successfully any given time, it's 730 // only important that we try to do this periodically so that things stay in an 731 // optimal state over long periods of time. For this reason, the work is done 732 // asynchronously in the background on the given tablet and any failures are 733 // logged as warnings. Because it's done in the background we use the AllPrivs 734 // account to be sure that we don't execute the writes if READ_ONLY is set on 735 // the MySQL instance. 736 func (wr *Wrangler) optimizeCopyStateTable(tablet *topodatapb.Tablet) { 737 if wr.sem != nil { 738 if !wr.sem.TryAcquire() { 739 log.Warningf("Deferring work to optimize the copy_state table on %q due to hitting the maximum concurrent background job limit.", 740 tablet.Alias.String()) 741 return 742 } 743 } 744 go func() { 745 defer func() { 746 if wr.sem != nil { 747 wr.sem.Release() 748 } 749 }() 750 ctx, cancel := context.WithTimeout(context.Background(), 30*time.Minute) 751 defer cancel() 752 sqlOptimizeTable := "optimize table _vt.copy_state" 753 if _, err := wr.tmc.ExecuteFetchAsAllPrivs(ctx, tablet, &tabletmanagerdatapb.ExecuteFetchAsAllPrivsRequest{ 754 Query: []byte(sqlOptimizeTable), 755 MaxRows: uint64(100), // always produces 1+rows with notes and status 756 }); err != nil { 757 if sqlErr, ok := err.(*mysql.SQLError); ok && sqlErr.Num == mysql.ERNoSuchTable { // the table may not exist 758 return 759 } 760 log.Warningf("Failed to optimize the copy_state table on %q: %v", tablet.Alias.String(), err) 761 } 762 // This will automatically set the value to 1 or the current max value in the table, whichever is greater 763 sqlResetAutoInc := "alter table _vt.copy_state auto_increment = 1" 764 if _, err := wr.tmc.ExecuteFetchAsAllPrivs(ctx, tablet, &tabletmanagerdatapb.ExecuteFetchAsAllPrivsRequest{ 765 Query: []byte(sqlResetAutoInc), 766 MaxRows: uint64(0), 767 }); err != nil { 768 log.Warningf("Failed to reset the auto_increment value for the copy_state table on %q: %v", 769 tablet.Alias.String(), err) 770 } 771 }() 772 } 773 774 // endregion