vitess.io/vitess@v0.16.2/go/vt/wrangler/workflow.go (about)

     1  package wrangler
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"sort"
     7  	"strings"
     8  	"sync"
     9  	"time"
    10  
    11  	"vitess.io/vitess/go/mysql"
    12  	"vitess.io/vitess/go/sqltypes"
    13  	"vitess.io/vitess/go/vt/discovery"
    14  	"vitess.io/vitess/go/vt/log"
    15  	"vitess.io/vitess/go/vt/topo"
    16  	"vitess.io/vitess/go/vt/topotools"
    17  	"vitess.io/vitess/go/vt/vtctl/workflow"
    18  	"vitess.io/vitess/go/vt/vtgate/evalengine"
    19  
    20  	tabletmanagerdatapb "vitess.io/vitess/go/vt/proto/tabletmanagerdata"
    21  	topodatapb "vitess.io/vitess/go/vt/proto/topodata"
    22  )
    23  
    24  // VReplicationWorkflowType specifies whether workflow is MoveTables or Reshard
    25  type VReplicationWorkflowType int
    26  
    27  // VReplicationWorkflowType enums
    28  const (
    29  	MoveTablesWorkflow = VReplicationWorkflowType(iota)
    30  	ReshardWorkflow
    31  	MigrateWorkflow
    32  )
    33  
    34  // Workflow state display strings
    35  const (
    36  	WorkflowStateNotCreated     = "Not Created"
    37  	WorkflowStateNotSwitched    = "Reads Not Switched. Writes Not Switched"
    38  	WorkflowStateReadsSwitched  = "All Reads Switched. Writes Not Switched"
    39  	WorkflowStateWritesSwitched = "Reads Not Switched. Writes Switched"
    40  	WorkflowStateAllSwitched    = "All Reads Switched. Writes Switched"
    41  )
    42  
    43  // region Move Tables Public API
    44  
    45  // VReplicationWorkflowParams stores args and options passed to a VReplicationWorkflow command
    46  type VReplicationWorkflowParams struct {
    47  	WorkflowType                      VReplicationWorkflowType
    48  	Workflow, TargetKeyspace          string
    49  	Cells, TabletTypes, ExcludeTables string
    50  	EnableReverseReplication, DryRun  bool
    51  	KeepData                          bool
    52  	KeepRoutingRules                  bool
    53  	Timeout                           time.Duration
    54  	Direction                         workflow.TrafficSwitchDirection
    55  	MaxAllowedTransactionLagSeconds   int64
    56  	OnDDL                             string
    57  
    58  	// MoveTables/Migrate specific
    59  	SourceKeyspace, Tables  string
    60  	AllTables, RenameTables bool
    61  	SourceTimeZone          string
    62  	DropForeignKeys         bool
    63  
    64  	// Reshard specific
    65  	SourceShards, TargetShards []string
    66  	SkipSchemaCopy             bool
    67  	AutoStart, StopAfterCopy   bool
    68  
    69  	// MoveTables/Migrate and Reshard specific
    70  	DeferSecondaryKeys bool
    71  
    72  	// Migrate specific
    73  	ExternalCluster string
    74  }
    75  
// VReplicationWorkflow stores various internal objects for a workflow
type VReplicationWorkflow struct {
	// workflowType selects the code path used by Create and gates the
	// actions that are not valid for Migrate workflows.
	workflowType VReplicationWorkflowType
	// ctx is the context captured by NewVReplicationWorkflow; it is reused
	// for the wrangler calls made by the methods of this type.
	ctx          context.Context
	// wr is the wrangler that carries out the actual operations.
	wr           *Wrangler
	// params holds the caller supplied options. NewVReplicationWorkflow
	// overwrites the keyspace and workflow names when a workflow is found.
	params       *VReplicationWorkflowParams
	// ts is the traffic switcher returned by getWorkflowState, if any.
	ts           *trafficSwitcher
	// ws is the most recently loaded workflow state; nil means the workflow
	// was not found (see Exists).
	ws           *workflow.State
}
    85  
    86  func (vrw *VReplicationWorkflow) String() string {
    87  	s := ""
    88  	s += fmt.Sprintf("Parameters: %+v\n", vrw.params)
    89  	s += fmt.Sprintf("State: %+v", vrw.CachedState())
    90  	return s
    91  }
    92  
    93  // NewVReplicationWorkflow sets up a MoveTables or Reshard workflow based on options provided, deduces the state of the
    94  // workflow from the persistent state stored in the vreplication table and the topo
    95  func (wr *Wrangler) NewVReplicationWorkflow(ctx context.Context, workflowType VReplicationWorkflowType,
    96  	params *VReplicationWorkflowParams) (*VReplicationWorkflow, error) {
    97  
    98  	log.Infof("NewVReplicationWorkflow with params %+v", params)
    99  	vrw := &VReplicationWorkflow{wr: wr, ctx: ctx, params: params, workflowType: workflowType}
   100  	ts, ws, err := wr.getWorkflowState(ctx, params.TargetKeyspace, params.Workflow)
   101  	if err != nil {
   102  		return nil, err
   103  	}
   104  	log.Infof("Workflow state is %+v", ws)
   105  	if ts != nil { //Other than on create we need to get SourceKeyspace from the workflow
   106  		vrw.params.TargetKeyspace = ts.targetKeyspace
   107  		vrw.params.Workflow = ts.workflow
   108  		vrw.params.SourceKeyspace = ts.sourceKeyspace
   109  	}
   110  	vrw.ts = ts
   111  	vrw.ws = ws
   112  	return vrw, nil
   113  }
   114  
   115  func (vrw *VReplicationWorkflow) reloadState() (*workflow.State, error) {
   116  	var err error
   117  	vrw.ts, vrw.ws, err = vrw.wr.getWorkflowState(vrw.ctx, vrw.params.TargetKeyspace, vrw.params.Workflow)
   118  	return vrw.ws, err
   119  }
   120  
   121  // CurrentState reloads and returns a human readable workflow state
   122  func (vrw *VReplicationWorkflow) CurrentState() string {
   123  	var err error
   124  	vrw.ws, err = vrw.reloadState()
   125  	if err != nil {
   126  		return err.Error()
   127  	}
   128  	if vrw.ws == nil {
   129  		return "Workflow Not Found"
   130  	}
   131  	return vrw.stateAsString(vrw.ws)
   132  }
   133  
// CachedState returns a human readable rendering of the workflow state that
// was most recently loaded into vrw.ws (by NewVReplicationWorkflow or a later
// reload); it does not reload the state itself.
func (vrw *VReplicationWorkflow) CachedState() string {
	return vrw.stateAsString(vrw.ws)
}
   138  
// Exists checks if the workflow has already been initiated, i.e. whether the
// most recently loaded workflow state is non-nil.
func (vrw *VReplicationWorkflow) Exists() bool {
	return vrw.ws != nil
}
   143  
   144  func (vrw *VReplicationWorkflow) stateAsString(ws *workflow.State) string {
   145  	log.Infof("Workflow state is %+v", ws)
   146  	var stateInfo []string
   147  	s := ""
   148  	if !vrw.Exists() {
   149  		stateInfo = append(stateInfo, WorkflowStateNotCreated)
   150  	} else {
   151  		if !ws.IsPartialMigration { // shard level traffic switching is all or nothing
   152  			if len(ws.RdonlyCellsNotSwitched) == 0 && len(ws.ReplicaCellsNotSwitched) == 0 && len(ws.ReplicaCellsSwitched) > 0 {
   153  				s = "All Reads Switched"
   154  			} else if len(ws.RdonlyCellsSwitched) == 0 && len(ws.ReplicaCellsSwitched) == 0 {
   155  				s = "Reads Not Switched"
   156  			} else {
   157  				stateInfo = append(stateInfo, "Reads partially switched")
   158  				if len(ws.ReplicaCellsNotSwitched) == 0 {
   159  					s += "All Replica Reads Switched"
   160  				} else if len(ws.ReplicaCellsSwitched) == 0 {
   161  					s += "Replica not switched"
   162  				} else {
   163  					s += "Replica switched in cells: " + strings.Join(ws.ReplicaCellsSwitched, ",")
   164  				}
   165  				stateInfo = append(stateInfo, s)
   166  				s = ""
   167  				if len(ws.RdonlyCellsNotSwitched) == 0 {
   168  					s += "All Rdonly Reads Switched"
   169  				} else if len(ws.RdonlyCellsSwitched) == 0 {
   170  					s += "Rdonly not switched"
   171  				} else {
   172  					s += "Rdonly switched in cells: " + strings.Join(ws.RdonlyCellsSwitched, ",")
   173  				}
   174  			}
   175  			stateInfo = append(stateInfo, s)
   176  		}
   177  		if ws.WritesSwitched {
   178  			stateInfo = append(stateInfo, "Writes Switched")
   179  		} else if ws.IsPartialMigration {
   180  			// For partial migrations, the traffic switching is all or nothing
   181  			// at the shard level, so reads are effectively switched on the
   182  			// shard when writes are switched.
   183  			if len(ws.ShardsAlreadySwitched) > 0 && len(ws.ShardsNotYetSwitched) > 0 {
   184  				stateInfo = append(stateInfo, fmt.Sprintf("Reads partially switched, for shards: %s", strings.Join(ws.ShardsAlreadySwitched, ",")))
   185  				stateInfo = append(stateInfo, fmt.Sprintf("Writes partially switched, for shards: %s", strings.Join(ws.ShardsAlreadySwitched, ",")))
   186  			} else {
   187  				if len(ws.ShardsAlreadySwitched) == 0 {
   188  					stateInfo = append(stateInfo, "Reads Not Switched")
   189  					stateInfo = append(stateInfo, "Writes Not Switched")
   190  				} else {
   191  					stateInfo = append(stateInfo, "All Reads Switched")
   192  					stateInfo = append(stateInfo, "All Writes Switched")
   193  				}
   194  			}
   195  		} else {
   196  			stateInfo = append(stateInfo, "Writes Not Switched")
   197  		}
   198  	}
   199  	return strings.Join(stateInfo, ". ")
   200  }
   201  
   202  // Create initiates a workflow
   203  func (vrw *VReplicationWorkflow) Create(ctx context.Context) error {
   204  	var err error
   205  	if vrw.Exists() {
   206  		return fmt.Errorf("workflow already exists")
   207  	}
   208  	if vrw.CachedState() != WorkflowStateNotCreated {
   209  		return fmt.Errorf("workflow has already been created, state is %s", vrw.CachedState())
   210  	}
   211  	switch vrw.workflowType {
   212  	case MoveTablesWorkflow, MigrateWorkflow:
   213  		err = vrw.initMoveTables()
   214  	case ReshardWorkflow:
   215  		excludeTables := strings.Split(vrw.params.ExcludeTables, ",")
   216  		keyspace := vrw.params.SourceKeyspace
   217  
   218  		vschmErr := vrw.wr.ValidateVSchema(ctx, keyspace, vrw.params.SourceShards, excludeTables, true /*includeViews*/)
   219  		if vschmErr != nil {
   220  			return fmt.Errorf("Create ReshardWorkflow failed: %v", vschmErr)
   221  		}
   222  
   223  		err = vrw.initReshard()
   224  	default:
   225  		return fmt.Errorf("unknown workflow type %d", vrw.workflowType)
   226  	}
   227  	if err != nil {
   228  		return err
   229  	}
   230  	return nil
   231  }
   232  
// WorkflowError has per stream errors if present in a workflow
type WorkflowError struct {
	// Tablet is the tablet reported for the failing stream.
	Tablet      string
	// ID is the id of the failing stream.
	ID          int64
	// Description is the error message reported by the stream.
	Description string
}
   239  
   240  // NewWorkflowError returns a new WorkflowError object
   241  func NewWorkflowError(tablet string, id int64, description string) *WorkflowError {
   242  	wfErr := &WorkflowError{
   243  		Tablet:      tablet,
   244  		ID:          id,
   245  		Description: description,
   246  	}
   247  	return wfErr
   248  }
   249  
   250  // GetStreamCount returns a count of total streams and of streams that have started processing
   251  func (vrw *VReplicationWorkflow) GetStreamCount() (int64, int64, []*WorkflowError, error) {
   252  	var err error
   253  	var workflowErrors []*WorkflowError
   254  	var total, started int64
   255  	res, err := vrw.wr.ShowWorkflow(vrw.ctx, vrw.params.Workflow, vrw.params.TargetKeyspace)
   256  	if err != nil {
   257  		return 0, 0, nil, err
   258  	}
   259  	for ksShard := range res.ShardStatuses {
   260  		statuses := res.ShardStatuses[ksShard].PrimaryReplicationStatuses
   261  		for _, st := range statuses {
   262  			total++
   263  			if strings.HasPrefix(st.Message, "Error:") {
   264  				workflowErrors = append(workflowErrors, NewWorkflowError(st.Tablet, st.ID, st.Message))
   265  				continue
   266  			}
   267  			if st.Pos == "" {
   268  				continue
   269  			}
   270  			if st.State == "Running" || st.State == "Copying" {
   271  				started++
   272  			}
   273  		}
   274  	}
   275  
   276  	return total, started, workflowErrors, nil
   277  }
   278  
   279  // SwitchTraffic switches traffic in the direction passed for specified tablet_types
   280  func (vrw *VReplicationWorkflow) SwitchTraffic(direction workflow.TrafficSwitchDirection) (*[]string, error) {
   281  	var dryRunResults []string
   282  	var rdDryRunResults, wrDryRunResults *[]string
   283  	var err error
   284  	var hasReplica, hasRdonly, hasPrimary bool
   285  
   286  	if !vrw.Exists() {
   287  		return nil, fmt.Errorf("workflow has not yet been started")
   288  	}
   289  	if vrw.workflowType == MigrateWorkflow {
   290  		return nil, fmt.Errorf("invalid action for Migrate workflow: SwitchTraffic")
   291  	}
   292  
   293  	vrw.params.Direction = direction
   294  
   295  	workflowName := vrw.params.Workflow
   296  	keyspace := vrw.params.TargetKeyspace
   297  	if vrw.params.Direction == workflow.DirectionBackward {
   298  		workflowName = workflow.ReverseWorkflowName(workflowName)
   299  		keyspace = vrw.params.SourceKeyspace
   300  	}
   301  
   302  	reason, err := vrw.canSwitch(keyspace, workflowName)
   303  	if err != nil {
   304  		return nil, err
   305  	}
   306  	if reason != "" {
   307  		return nil, fmt.Errorf("cannot switch traffic for workflow %s at this time: %s", workflowName, reason)
   308  	}
   309  
   310  	hasReplica, hasRdonly, hasPrimary, err = vrw.parseTabletTypes()
   311  	if err != nil {
   312  		return nil, err
   313  	}
   314  	if hasReplica || hasRdonly {
   315  		if rdDryRunResults, err = vrw.switchReads(); err != nil {
   316  			return nil, err
   317  		}
   318  	}
   319  	if rdDryRunResults != nil {
   320  		dryRunResults = append(dryRunResults, *rdDryRunResults...)
   321  	}
   322  	if hasPrimary {
   323  		if wrDryRunResults, err = vrw.switchWrites(); err != nil {
   324  			return nil, err
   325  		}
   326  	}
   327  	if wrDryRunResults != nil {
   328  		dryRunResults = append(dryRunResults, *wrDryRunResults...)
   329  	}
   330  	return &dryRunResults, nil
   331  }
   332  
   333  // ReverseTraffic switches traffic backwards for tablet_types passed
   334  func (vrw *VReplicationWorkflow) ReverseTraffic() (*[]string, error) {
   335  	if !vrw.Exists() {
   336  		return nil, fmt.Errorf("workflow has not yet been started")
   337  	}
   338  	if vrw.workflowType == MigrateWorkflow {
   339  		return nil, fmt.Errorf("invalid action for Migrate workflow: ReverseTraffic")
   340  	}
   341  	return vrw.SwitchTraffic(workflow.DirectionBackward)
   342  }
   343  
   344  // Workflow errors
   345  const (
   346  	ErrWorkflowNotFullySwitched  = "cannot complete workflow because you have not yet switched all read and write traffic"
   347  	ErrWorkflowPartiallySwitched = "cannot cancel workflow because you have already switched some or all read and write traffic"
   348  )
   349  
   350  // Complete cleans up a successful workflow
   351  func (vrw *VReplicationWorkflow) Complete() (*[]string, error) {
   352  	var dryRunResults *[]string
   353  	var err error
   354  	ws := vrw.ws
   355  
   356  	if vrw.workflowType == MigrateWorkflow {
   357  		return vrw.wr.finalizeMigrateWorkflow(vrw.ctx, ws.TargetKeyspace, ws.Workflow, vrw.params.Tables,
   358  			false, vrw.params.KeepData, vrw.params.KeepRoutingRules, vrw.params.DryRun)
   359  	}
   360  
   361  	if !ws.WritesSwitched || len(ws.ReplicaCellsNotSwitched) > 0 || len(ws.RdonlyCellsNotSwitched) > 0 {
   362  		return nil, fmt.Errorf(ErrWorkflowNotFullySwitched)
   363  	}
   364  	var renameTable workflow.TableRemovalType
   365  	if vrw.params.RenameTables {
   366  		renameTable = workflow.RenameTable
   367  	} else {
   368  		renameTable = workflow.DropTable
   369  	}
   370  	if dryRunResults, err = vrw.wr.DropSources(vrw.ctx, vrw.ws.TargetKeyspace, vrw.ws.Workflow, renameTable,
   371  		vrw.params.KeepData, vrw.params.KeepRoutingRules, false /* force */, vrw.params.DryRun); err != nil {
   372  		return nil, err
   373  	}
   374  	return dryRunResults, nil
   375  }
   376  
   377  // Cancel deletes all artifacts from a workflow which has not yet been switched
   378  func (vrw *VReplicationWorkflow) Cancel() error {
   379  	ws := vrw.ws
   380  	if vrw.workflowType == MigrateWorkflow {
   381  		_, err := vrw.wr.finalizeMigrateWorkflow(vrw.ctx, ws.TargetKeyspace, ws.Workflow, "",
   382  			true, vrw.params.KeepData, vrw.params.KeepRoutingRules, vrw.params.DryRun)
   383  		return err
   384  	}
   385  
   386  	if ws.WritesSwitched || len(ws.ReplicaCellsSwitched) > 0 || len(ws.RdonlyCellsSwitched) > 0 {
   387  		return fmt.Errorf(ErrWorkflowPartiallySwitched)
   388  	}
   389  	if _, err := vrw.wr.DropTargets(vrw.ctx, vrw.ws.TargetKeyspace, vrw.ws.Workflow, vrw.params.KeepData, vrw.params.KeepRoutingRules, false); err != nil {
   390  		return err
   391  	}
   392  	vrw.ts = nil
   393  	return nil
   394  }
   395  
   396  // endregion
   397  
   398  // region Helpers
   399  
   400  func (vrw *VReplicationWorkflow) getCellsAsArray() []string {
   401  	if vrw.params.Cells != "" {
   402  		return strings.Split(vrw.params.Cells, ",")
   403  	}
   404  	return nil
   405  }
   406  
   407  func (vrw *VReplicationWorkflow) parseTabletTypes() (hasReplica, hasRdonly, hasPrimary bool, err error) {
   408  	tabletTypes, _, err := discovery.ParseTabletTypesAndOrder(vrw.params.TabletTypes)
   409  	if err != nil {
   410  		return false, false, false, err
   411  	}
   412  	for _, tabletType := range tabletTypes {
   413  		switch tabletType {
   414  		case topodatapb.TabletType_REPLICA:
   415  			hasReplica = true
   416  		case topodatapb.TabletType_RDONLY:
   417  			hasRdonly = true
   418  		case topodatapb.TabletType_PRIMARY:
   419  			hasPrimary = true
   420  		default:
   421  			return false, false, false, fmt.Errorf("invalid tablet type passed %s", tabletType)
   422  		}
   423  	}
   424  	return hasReplica, hasRdonly, hasPrimary, nil
   425  }
   426  
   427  // endregion
   428  
   429  // region Core Actions
   430  
   431  func (vrw *VReplicationWorkflow) initMoveTables() error {
   432  	log.Infof("In VReplicationWorkflow.initMoveTables() for %+v", vrw)
   433  	return vrw.wr.MoveTables(vrw.ctx, vrw.params.Workflow, vrw.params.SourceKeyspace, vrw.params.TargetKeyspace,
   434  		vrw.params.Tables, vrw.params.Cells, vrw.params.TabletTypes, vrw.params.AllTables, vrw.params.ExcludeTables,
   435  		vrw.params.AutoStart, vrw.params.StopAfterCopy, vrw.params.ExternalCluster, vrw.params.DropForeignKeys,
   436  		vrw.params.DeferSecondaryKeys, vrw.params.SourceTimeZone, vrw.params.OnDDL, vrw.params.SourceShards)
   437  }
   438  
   439  func (vrw *VReplicationWorkflow) initReshard() error {
   440  	log.Infof("In VReplicationWorkflow.initReshard() for %+v", vrw)
   441  	return vrw.wr.Reshard(vrw.ctx, vrw.params.TargetKeyspace, vrw.params.Workflow, vrw.params.SourceShards,
   442  		vrw.params.TargetShards, vrw.params.SkipSchemaCopy, vrw.params.Cells, vrw.params.TabletTypes,
   443  		vrw.params.OnDDL, vrw.params.AutoStart, vrw.params.StopAfterCopy, vrw.params.DeferSecondaryKeys)
   444  }
   445  
   446  func (vrw *VReplicationWorkflow) switchReads() (*[]string, error) {
   447  	log.Infof("In VReplicationWorkflow.switchReads() for %+v", vrw)
   448  	fullTabletTypes, _, err := discovery.ParseTabletTypesAndOrder(vrw.params.TabletTypes)
   449  	if err != nil {
   450  		return nil, err
   451  	}
   452  	var nonPrimaryTabletTypes []topodatapb.TabletType
   453  	for _, tt := range fullTabletTypes {
   454  		if tt != topodatapb.TabletType_PRIMARY {
   455  			nonPrimaryTabletTypes = append(nonPrimaryTabletTypes, tt)
   456  		}
   457  	}
   458  	var dryRunResults *[]string
   459  	dryRunResults, err = vrw.wr.SwitchReads(vrw.ctx, vrw.params.TargetKeyspace, vrw.params.Workflow, nonPrimaryTabletTypes,
   460  		vrw.getCellsAsArray(), vrw.params.Direction, vrw.params.DryRun)
   461  	if err != nil {
   462  		return nil, err
   463  	}
   464  	return dryRunResults, nil
   465  }
   466  
   467  func (vrw *VReplicationWorkflow) switchWrites() (*[]string, error) {
   468  	var journalID int64
   469  	var dryRunResults *[]string
   470  	var err error
   471  	log.Infof("In VReplicationWorkflow.switchWrites() for %+v", vrw)
   472  	if vrw.params.Direction == workflow.DirectionBackward {
   473  		keyspace := vrw.params.SourceKeyspace
   474  		vrw.params.SourceKeyspace = vrw.params.TargetKeyspace
   475  		vrw.params.TargetKeyspace = keyspace
   476  		vrw.params.Workflow = workflow.ReverseWorkflowName(vrw.params.Workflow)
   477  		log.Infof("In VReplicationWorkflow.switchWrites(reverse) for %+v", vrw)
   478  	}
   479  	journalID, dryRunResults, err = vrw.wr.SwitchWrites(vrw.ctx, vrw.params.TargetKeyspace, vrw.params.Workflow, vrw.params.Timeout,
   480  		false, vrw.params.Direction == workflow.DirectionBackward, vrw.params.EnableReverseReplication, vrw.params.DryRun)
   481  	if err != nil {
   482  		return nil, err
   483  	}
   484  	log.Infof("switchWrites succeeded with journal id %s", journalID)
   485  	return dryRunResults, nil
   486  }
   487  
   488  // endregion
   489  
   490  // region Copy Progress
   491  
   492  // TableCopyProgress stores the row counts and disk sizes of the source and target tables
   493  type TableCopyProgress struct {
   494  	TargetRowCount, TargetTableSize int64
   495  	SourceRowCount, SourceTableSize int64
   496  }
   497  
// CopyProgress stores the TableCopyProgress for all tables still being copied,
// keyed by table name.
type CopyProgress map[string]*TableCopyProgress
   500  
   501  const (
   502  	cannotSwitchError               = "workflow has errors"
   503  	cannotSwitchCopyIncomplete      = "copy is still in progress"
   504  	cannotSwitchHighLag             = "replication lag %ds is higher than allowed lag %ds"
   505  	cannotSwitchFailedTabletRefresh = "could not refresh all of the tablets involved in the operation:\n%s"
   506  	cannotSwitchFrozen              = "workflow is frozen"
   507  )
   508  
   509  func (vrw *VReplicationWorkflow) canSwitch(keyspace, workflowName string) (reason string, err error) {
   510  	ws, err := vrw.reloadState()
   511  	if err != nil {
   512  		return "", err
   513  	}
   514  	if vrw.params.Direction == workflow.DirectionForward && ws.WritesSwitched ||
   515  		vrw.params.Direction == workflow.DirectionBackward && !ws.WritesSwitched {
   516  		log.Infof("writes already switched no need to check lag")
   517  		return "", nil
   518  	}
   519  	log.Infof("state:%s, direction %d, switched %t", vrw.CachedState(), vrw.params.Direction, ws.WritesSwitched)
   520  	result, err := vrw.wr.getStreams(vrw.ctx, workflowName, keyspace)
   521  	if err != nil {
   522  		return "", err
   523  	}
   524  	for ksShard := range result.ShardStatuses {
   525  		statuses := result.ShardStatuses[ksShard].PrimaryReplicationStatuses
   526  		for _, st := range statuses {
   527  			switch st.State {
   528  			case "Copying":
   529  				return cannotSwitchCopyIncomplete, nil
   530  			case "Error":
   531  				return cannotSwitchError, nil
   532  			}
   533  		}
   534  	}
   535  	if result.Frozen {
   536  		return cannotSwitchFrozen, nil
   537  	}
   538  	if result.MaxVReplicationTransactionLag > vrw.params.MaxAllowedTransactionLagSeconds {
   539  		return fmt.Sprintf(cannotSwitchHighLag, result.MaxVReplicationTransactionLag, vrw.params.MaxAllowedTransactionLagSeconds), nil
   540  	}
   541  
   542  	// Ensure that the tablets on both sides are in good shape as we make this same call in the process
   543  	// and an error will cause us to backout
   544  	refreshErrors := strings.Builder{}
   545  	var m sync.Mutex
   546  	var wg sync.WaitGroup
   547  	rtbsCtx, cancel := context.WithTimeout(vrw.ctx, shardTabletRefreshTimeout)
   548  	defer cancel()
   549  	refreshTablets := func(shards []*topo.ShardInfo, stype string) {
   550  		defer wg.Done()
   551  		for _, si := range shards {
   552  			if partial, partialDetails, err := topotools.RefreshTabletsByShard(rtbsCtx, vrw.wr.ts, vrw.wr.tmc, si, nil, vrw.wr.Logger()); err != nil || partial {
   553  				m.Lock()
   554  				refreshErrors.WriteString(fmt.Sprintf("failed to successfully refresh all tablets in the %s/%s %s shard (%v):\n  %v\n",
   555  					si.Keyspace(), si.ShardName(), stype, err, partialDetails))
   556  				m.Unlock()
   557  			}
   558  		}
   559  	}
   560  	wg.Add(1)
   561  	go refreshTablets(vrw.ts.SourceShards(), "source")
   562  	wg.Add(1)
   563  	go refreshTablets(vrw.ts.TargetShards(), "target")
   564  	wg.Wait()
   565  	if refreshErrors.Len() > 0 {
   566  		return fmt.Sprintf(cannotSwitchFailedTabletRefresh, refreshErrors.String()), nil
   567  	}
   568  	return "", nil
   569  }
   570  
   571  // GetCopyProgress returns the progress of all tables being copied in the workflow
   572  func (vrw *VReplicationWorkflow) GetCopyProgress() (*CopyProgress, error) {
   573  	ctx := context.Background()
   574  	getTablesQuery := "select distinct table_name from _vt.copy_state cs, _vt.vreplication vr where vr.id = cs.vrepl_id and vr.id = %d"
   575  	getRowCountQuery := "select table_name, table_rows, data_length from information_schema.tables where table_schema = %s and table_name in (%s)"
   576  	tables := make(map[string]bool)
   577  	const MaxRows = 1000
   578  	sourcePrimaries := make(map[*topodatapb.TabletAlias]bool)
   579  	for _, target := range vrw.ts.targets {
   580  		for id, bls := range target.Sources {
   581  			query := fmt.Sprintf(getTablesQuery, id)
   582  			p3qr, err := vrw.wr.tmc.ExecuteFetchAsDba(ctx, target.GetPrimary().Tablet, true, &tabletmanagerdatapb.ExecuteFetchAsDbaRequest{
   583  				Query:   []byte(query),
   584  				MaxRows: MaxRows,
   585  			})
   586  			if err != nil {
   587  				return nil, err
   588  			}
   589  			if len(p3qr.Rows) < 1 {
   590  				continue
   591  			}
   592  			qr := sqltypes.Proto3ToResult(p3qr)
   593  			for i := 0; i < len(p3qr.Rows); i++ {
   594  				tables[qr.Rows[i][0].ToString()] = true
   595  			}
   596  			sourcesi, err := vrw.wr.ts.GetShard(ctx, bls.Keyspace, bls.Shard)
   597  			if err != nil {
   598  				return nil, err
   599  			}
   600  			found := false
   601  			for existingSource := range sourcePrimaries {
   602  				if existingSource.Uid == sourcesi.PrimaryAlias.Uid {
   603  					found = true
   604  				}
   605  			}
   606  			if !found {
   607  				sourcePrimaries[sourcesi.PrimaryAlias] = true
   608  			}
   609  		}
   610  	}
   611  	if len(tables) == 0 {
   612  		return nil, nil
   613  	}
   614  	var tableList []string
   615  	targetRowCounts := make(map[string]int64)
   616  	sourceRowCounts := make(map[string]int64)
   617  	targetTableSizes := make(map[string]int64)
   618  	sourceTableSizes := make(map[string]int64)
   619  
   620  	for table := range tables {
   621  		tableList = append(tableList, encodeString(table))
   622  		targetRowCounts[table] = 0
   623  		sourceRowCounts[table] = 0
   624  		targetTableSizes[table] = 0
   625  		sourceTableSizes[table] = 0
   626  	}
   627  
   628  	var getTableMetrics = func(tablet *topodatapb.Tablet, query string, rowCounts *map[string]int64, tableSizes *map[string]int64) error {
   629  		p3qr, err := vrw.wr.tmc.ExecuteFetchAsDba(ctx, tablet, true, &tabletmanagerdatapb.ExecuteFetchAsDbaRequest{
   630  			Query:   []byte(query),
   631  			MaxRows: uint64(len(tables)),
   632  		})
   633  		if err != nil {
   634  			return err
   635  		}
   636  		qr := sqltypes.Proto3ToResult(p3qr)
   637  		for i := 0; i < len(qr.Rows); i++ {
   638  			table := qr.Rows[i][0].ToString()
   639  			rowCount, err := evalengine.ToInt64(qr.Rows[i][1])
   640  			if err != nil {
   641  				return err
   642  			}
   643  			tableSize, err := evalengine.ToInt64(qr.Rows[i][2])
   644  			if err != nil {
   645  				return err
   646  			}
   647  			(*rowCounts)[table] += rowCount
   648  			(*tableSizes)[table] += tableSize
   649  		}
   650  		return nil
   651  	}
   652  	sourceDbName := ""
   653  	for _, tsSource := range vrw.ts.sources {
   654  		sourceDbName = tsSource.GetPrimary().DbName()
   655  		break
   656  	}
   657  	if sourceDbName == "" {
   658  		return nil, fmt.Errorf("no sources found for workflow %s.%s", vrw.ws.TargetKeyspace, vrw.ws.Workflow)
   659  	}
   660  	targetDbName := ""
   661  	for _, tsTarget := range vrw.ts.targets {
   662  		targetDbName = tsTarget.GetPrimary().DbName()
   663  		break
   664  	}
   665  	if sourceDbName == "" || targetDbName == "" {
   666  		return nil, fmt.Errorf("workflow %s.%s is incorrectly configured", vrw.ws.TargetKeyspace, vrw.ws.Workflow)
   667  	}
   668  	sort.Strings(tableList) // sort list for repeatability for mocking in tests
   669  	tablesStr := strings.Join(tableList, ",")
   670  	query := fmt.Sprintf(getRowCountQuery, encodeString(targetDbName), tablesStr)
   671  	for _, target := range vrw.ts.targets {
   672  		tablet := target.GetPrimary().Tablet
   673  		if err := getTableMetrics(tablet, query, &targetRowCounts, &targetTableSizes); err != nil {
   674  			return nil, err
   675  		}
   676  	}
   677  
   678  	query = fmt.Sprintf(getRowCountQuery, encodeString(sourceDbName), tablesStr)
   679  	for source := range sourcePrimaries {
   680  		ti, err := vrw.wr.ts.GetTablet(ctx, source)
   681  		tablet := ti.Tablet
   682  		if err != nil {
   683  			return nil, err
   684  		}
   685  		if err := getTableMetrics(tablet, query, &sourceRowCounts, &sourceTableSizes); err != nil {
   686  			return nil, err
   687  		}
   688  	}
   689  
   690  	copyProgress := CopyProgress{}
   691  	for table, rowCount := range targetRowCounts {
   692  		copyProgress[table] = &TableCopyProgress{
   693  			TargetRowCount:  rowCount,
   694  			TargetTableSize: targetTableSizes[table],
   695  			SourceRowCount:  sourceRowCounts[table],
   696  			SourceTableSize: sourceTableSizes[table],
   697  		}
   698  	}
   699  	return &copyProgress, nil
   700  }
   701  
   702  // endregion
   703  
   704  // region Workflow related utility functions
   705  
   706  // deleteWorkflowVDiffData cleans up any potential VDiff related data associated with the workflow on the given tablet
   707  func (wr *Wrangler) deleteWorkflowVDiffData(ctx context.Context, tablet *topodatapb.Tablet, workflow string) {
   708  	sqlDeleteVDiffs := `delete from vd, vdt, vdl using _vt.vdiff as vd inner join _vt.vdiff_table as vdt on (vd.id = vdt.vdiff_id)
   709  						inner join _vt.vdiff_log as vdl on (vd.id = vdl.vdiff_id)
   710  						where vd.keyspace = %s and vd.workflow = %s`
   711  	query := fmt.Sprintf(sqlDeleteVDiffs, encodeString(tablet.Keyspace), encodeString(workflow))
   712  	rows := -1
   713  	if _, err := wr.tmc.ExecuteFetchAsDba(ctx, tablet, false, &tabletmanagerdatapb.ExecuteFetchAsDbaRequest{
   714  		Query:   []byte(query),
   715  		MaxRows: uint64(rows),
   716  	}); err != nil {
   717  		if sqlErr, ok := err.(*mysql.SQLError); ok && sqlErr.Num != mysql.ERNoSuchTable { // the tables may not exist if no vdiffs have been run
   718  			wr.Logger().Errorf("Error deleting vdiff data for %s.%s workflow: %v", tablet.Keyspace, workflow, err)
   719  		}
   720  	}
   721  }
   722  
   723  // optimizeCopyStateTable rebuilds the copy_state table to ensure the on-disk
   724  // structures are minimal and optimized and resets the auto-inc value for
   725  // subsequent inserts.
   726  // This helps to ensure that the size, storage, and performance related factors
   727  // for the table remain optimal over time and that we don't ever exhaust the
   728  // available auto-inc values for the table.
   729  // Note: it's not critical that this executes successfully any given time, it's
   730  // only important that we try to do this periodically so that things stay in an
   731  // optimal state over long periods of time. For this reason, the work is done
   732  // asynchronously in the background on the given tablet and any failures are
   733  // logged as warnings. Because it's done in the background we use the AllPrivs
   734  // account to be sure that we don't execute the writes if READ_ONLY is set on
   735  // the MySQL instance.
   736  func (wr *Wrangler) optimizeCopyStateTable(tablet *topodatapb.Tablet) {
   737  	if wr.sem != nil {
   738  		if !wr.sem.TryAcquire() {
   739  			log.Warningf("Deferring work to optimize the copy_state table on %q due to hitting the maximum concurrent background job limit.",
   740  				tablet.Alias.String())
   741  			return
   742  		}
   743  	}
   744  	go func() {
   745  		defer func() {
   746  			if wr.sem != nil {
   747  				wr.sem.Release()
   748  			}
   749  		}()
   750  		ctx, cancel := context.WithTimeout(context.Background(), 30*time.Minute)
   751  		defer cancel()
   752  		sqlOptimizeTable := "optimize table _vt.copy_state"
   753  		if _, err := wr.tmc.ExecuteFetchAsAllPrivs(ctx, tablet, &tabletmanagerdatapb.ExecuteFetchAsAllPrivsRequest{
   754  			Query:   []byte(sqlOptimizeTable),
   755  			MaxRows: uint64(100), // always produces 1+rows with notes and status
   756  		}); err != nil {
   757  			if sqlErr, ok := err.(*mysql.SQLError); ok && sqlErr.Num == mysql.ERNoSuchTable { // the table may not exist
   758  				return
   759  			}
   760  			log.Warningf("Failed to optimize the copy_state table on %q: %v", tablet.Alias.String(), err)
   761  		}
   762  		// This will automatically set the value to 1 or the current max value in the table, whichever is greater
   763  		sqlResetAutoInc := "alter table _vt.copy_state auto_increment = 1"
   764  		if _, err := wr.tmc.ExecuteFetchAsAllPrivs(ctx, tablet, &tabletmanagerdatapb.ExecuteFetchAsAllPrivsRequest{
   765  			Query:   []byte(sqlResetAutoInc),
   766  			MaxRows: uint64(0),
   767  		}); err != nil {
   768  			log.Warningf("Failed to reset the auto_increment value for the copy_state table on %q: %v",
   769  				tablet.Alias.String(), err)
   770  		}
   771  	}()
   772  }
   773  
   774  // endregion