vitess.io/vitess@v0.16.2/go/vt/wrangler/traffic_switcher.go (about)

     1  /*
     2  Copyright 2019 The Vitess Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package wrangler
    18  
    19  import (
    20  	"context"
    21  	"errors"
    22  	"fmt"
    23  	"reflect"
    24  	"sort"
    25  	"strings"
    26  	"sync"
    27  	"time"
    28  
    29  	"vitess.io/vitess/go/sqlescape"
    30  	"vitess.io/vitess/go/vt/discovery"
    31  
    32  	"vitess.io/vitess/go/json2"
    33  	"vitess.io/vitess/go/vt/binlog/binlogplayer"
    34  	"vitess.io/vitess/go/vt/concurrency"
    35  	"vitess.io/vitess/go/vt/key"
    36  	"vitess.io/vitess/go/vt/log"
    37  	"vitess.io/vitess/go/vt/logutil"
    38  	"vitess.io/vitess/go/vt/sqlparser"
    39  	"vitess.io/vitess/go/vt/topo"
    40  	"vitess.io/vitess/go/vt/topotools"
    41  	"vitess.io/vitess/go/vt/vtctl/workflow"
    42  	"vitess.io/vitess/go/vt/vterrors"
    43  	"vitess.io/vitess/go/vt/vtgate/vindexes"
    44  	"vitess.io/vitess/go/vt/vttablet/tabletmanager/vreplication"
    45  	"vitess.io/vitess/go/vt/vttablet/tmclient"
    46  
    47  	binlogdatapb "vitess.io/vitess/go/vt/proto/binlogdata"
    48  	querypb "vitess.io/vitess/go/vt/proto/query"
    49  	topodatapb "vitess.io/vitess/go/vt/proto/topodata"
    50  	vschemapb "vitess.io/vitess/go/vt/proto/vschema"
    51  	vtrpcpb "vitess.io/vitess/go/vt/proto/vtrpc"
    52  )
    53  
    54  const (
    55  	errorNoStreams = "no streams found in keyspace %s for: %s"
    56  	// use pt-osc's naming convention, this format also ensures vstreamer ignores such tables
    57  	renameTableTemplate = "_%.59s_old" // limit table name to 64 characters
    58  
    59  	sqlDeleteWorkflow = "delete from _vt.vreplication where db_name = %s and workflow = %s"
    60  )
    61  
    62  // accessType specifies the type of access for a shard (allow/disallow writes).
    63  type accessType int
    64  
    65  const (
    66  	allowWrites = accessType(iota)
    67  	disallowWrites
    68  
    69  	// number of LOCK TABLES cycles to perform on the sources during SwitchWrites
    70  	lockTablesCycles = 2
    71  	// time to wait between LOCK TABLES cycles on the sources during SwitchWrites
    72  	lockTablesCycleDelay = time.Duration(100 * time.Millisecond)
    73  
    74  	// How long to wait when refreshing the state of each tablet in a shard. Note that these
    75  	// are refreshed in parallel, non-topo errors are ignored (in the error handling) and we
    76  	// may only do a partial refresh. Because in some cases it's unsafe to switch the traffic
    77  	// if some tablets do not refresh, we may need to look for partial results and produce
    78  	// an error (with the provided details of WHY) if we see them.
    79  	// Side note: the default lock/lease TTL in etcd is 60s so the default tablet refresh
    80  	// timeout of 60s can cause us to lose our keyspace lock before completing the
    81  	// operation too.
    82  	shardTabletRefreshTimeout = time.Duration(30 * time.Second)
    83  )
    84  
// trafficSwitcher contains the metadata for switching read and write traffic
// for vreplication streams.
//
// It is the wrangler-side implementation of workflow.ITrafficSwitcher; most
// fields are exposed read-only through the accessor methods defined on the
// type. wr is the owning Wrangler and supplies the topo server, tablet manager
// client and logger. id is the value SwitchWrites returns as the journal ID.
// sources and targets hold the per-source and per-target migration metadata
// (presumably keyed by shard name — confirm against buildTrafficSwitcher).
//
// NOTE(review): field order is visible in "%+v" log output of this struct, so
// reordering fields changes what is logged.
type trafficSwitcher struct {
	migrationType      binlogdatapb.MigrationType
	isPartialMigration bool
	wr                 *Wrangler
	workflow           string

	// if frozen is true, the rest of the fields are not set.
	frozen           bool
	reverseWorkflow  string
	id               int64
	sources          map[string]*workflow.MigrationSource
	targets          map[string]*workflow.MigrationTarget
	sourceKeyspace   string
	targetKeyspace   string
	tables           []string
	keepRoutingRules bool
	sourceKSSchema   *vindexes.KeyspaceSchema
	optCells         string //cells option passed to MoveTables/Reshard
	optTabletTypes   string //tabletTypes option passed to MoveTables/Reshard
	externalCluster  string
	externalTopo     *topo.Server
	sourceTimeZone   string
	targetTimeZone   string
	workflowType     binlogdatapb.VReplicationWorkflowType
	workflowSubType  binlogdatapb.VReplicationWorkflowSubType
}
   113  
   114  /*
   115  begin: implementation of workflow.ITrafficSwitcher
   116  
   117  (NOTE:@ajm188) Please see comments on that interface type for why this exists.
   118  This is temporary to allow workflow.StreamMigrator to use this trafficSwitcher
   119  code and should be removed in the very near-term when we move trafficSwitcher to
   120  package workflow as well.
   121  */
   122  
   123  var _ workflow.ITrafficSwitcher = (*trafficSwitcher)(nil)
   124  
   125  func (ts *trafficSwitcher) TopoServer() *topo.Server                          { return ts.wr.ts }
   126  func (ts *trafficSwitcher) TabletManagerClient() tmclient.TabletManagerClient { return ts.wr.tmc }
   127  func (ts *trafficSwitcher) Logger() logutil.Logger                            { return ts.wr.logger }
   128  func (ts *trafficSwitcher) VReplicationExec(ctx context.Context, alias *topodatapb.TabletAlias, query string) (*querypb.QueryResult, error) {
   129  	return ts.wr.VReplicationExec(ctx, alias, query)
   130  }
   131  
   132  func (ts *trafficSwitcher) ExternalTopo() *topo.Server                     { return ts.externalTopo }
   133  func (ts *trafficSwitcher) MigrationType() binlogdatapb.MigrationType      { return ts.migrationType }
   134  func (ts *trafficSwitcher) IsPartialMigration() bool                       { return ts.isPartialMigration }
   135  func (ts *trafficSwitcher) ReverseWorkflowName() string                    { return ts.reverseWorkflow }
   136  func (ts *trafficSwitcher) SourceKeyspaceName() string                     { return ts.sourceKSSchema.Keyspace.Name }
   137  func (ts *trafficSwitcher) SourceKeyspaceSchema() *vindexes.KeyspaceSchema { return ts.sourceKSSchema }
   138  func (ts *trafficSwitcher) Sources() map[string]*workflow.MigrationSource  { return ts.sources }
   139  func (ts *trafficSwitcher) Tables() []string                               { return ts.tables }
   140  func (ts *trafficSwitcher) TargetKeyspaceName() string                     { return ts.targetKeyspace }
   141  func (ts *trafficSwitcher) Targets() map[string]*workflow.MigrationTarget  { return ts.targets }
   142  func (ts *trafficSwitcher) WorkflowName() string                           { return ts.workflow }
   143  func (ts *trafficSwitcher) SourceTimeZone() string                         { return ts.sourceTimeZone }
   144  func (ts *trafficSwitcher) TargetTimeZone() string                         { return ts.targetTimeZone }
   145  
   146  func (ts *trafficSwitcher) ForAllSources(f func(source *workflow.MigrationSource) error) error {
   147  	var wg sync.WaitGroup
   148  	allErrors := &concurrency.AllErrorRecorder{}
   149  	for _, source := range ts.sources {
   150  		wg.Add(1)
   151  		go func(source *workflow.MigrationSource) {
   152  			defer wg.Done()
   153  
   154  			if err := f(source); err != nil {
   155  				allErrors.RecordError(err)
   156  			}
   157  		}(source)
   158  	}
   159  	wg.Wait()
   160  	return allErrors.AggrError(vterrors.Aggregate)
   161  }
   162  
   163  func (ts *trafficSwitcher) ForAllTargets(f func(source *workflow.MigrationTarget) error) error {
   164  	var wg sync.WaitGroup
   165  	allErrors := &concurrency.AllErrorRecorder{}
   166  	for _, target := range ts.targets {
   167  		wg.Add(1)
   168  		go func(target *workflow.MigrationTarget) {
   169  			defer wg.Done()
   170  
   171  			if err := f(target); err != nil {
   172  				allErrors.RecordError(err)
   173  			}
   174  		}(target)
   175  	}
   176  	wg.Wait()
   177  	return allErrors.AggrError(vterrors.Aggregate)
   178  }
   179  
   180  func (ts *trafficSwitcher) ForAllUIDs(f func(target *workflow.MigrationTarget, uid uint32) error) error {
   181  	var wg sync.WaitGroup
   182  	allErrors := &concurrency.AllErrorRecorder{}
   183  	for _, target := range ts.Targets() {
   184  		for uid := range target.Sources {
   185  			wg.Add(1)
   186  			go func(target *workflow.MigrationTarget, uid uint32) {
   187  				defer wg.Done()
   188  
   189  				if err := f(target, uid); err != nil {
   190  					allErrors.RecordError(err)
   191  				}
   192  			}(target, uid)
   193  		}
   194  	}
   195  	wg.Wait()
   196  	return allErrors.AggrError(vterrors.Aggregate)
   197  }
   198  
   199  /* end: implementation of workflow.ITrafficSwitcher */
   200  
   201  func (wr *Wrangler) getWorkflowState(ctx context.Context, targetKeyspace, workflowName string) (*trafficSwitcher, *workflow.State, error) {
   202  	ts, err := wr.buildTrafficSwitcher(ctx, targetKeyspace, workflowName)
   203  
   204  	if ts == nil || err != nil {
   205  		if errors.Is(err, workflow.ErrNoStreams) || err.Error() == fmt.Sprintf(errorNoStreams, targetKeyspace, workflowName) {
   206  			return nil, nil, nil
   207  		}
   208  		wr.Logger().Errorf("buildTrafficSwitcher failed: %v", err)
   209  		return nil, nil, err
   210  	}
   211  
   212  	ws := workflow.NewServer(wr.ts, wr.tmc)
   213  	state := &workflow.State{
   214  		Workflow:           workflowName,
   215  		SourceKeyspace:     ts.SourceKeyspaceName(),
   216  		TargetKeyspace:     targetKeyspace,
   217  		IsPartialMigration: ts.isPartialMigration,
   218  	}
   219  
   220  	var (
   221  		reverse  bool
   222  		keyspace string
   223  	)
   224  
   225  	// We reverse writes by using the source_keyspace.workflowname_reverse workflow
   226  	// spec, so we need to use the source of the reverse workflow, which is the
   227  	// target of the workflow initiated by the user for checking routing rules.
   228  	// Similarly we use a target shard of the reverse workflow as the original
   229  	// source to check if writes have been switched.
   230  	if strings.HasSuffix(workflowName, "_reverse") {
   231  		reverse = true
   232  		keyspace = state.SourceKeyspace
   233  		workflowName = workflow.ReverseWorkflowName(workflowName)
   234  	} else {
   235  		keyspace = targetKeyspace
   236  	}
   237  	if ts.MigrationType() == binlogdatapb.MigrationType_TABLES {
   238  		state.WorkflowType = workflow.TypeMoveTables
   239  
   240  		// We assume a consistent state, so only choose routing rule for one table.
   241  		if len(ts.Tables()) == 0 {
   242  			return nil, nil, fmt.Errorf("no tables in workflow %s.%s", keyspace, workflowName)
   243  
   244  		}
   245  		table := ts.Tables()[0]
   246  
   247  		if ts.isPartialMigration { // shard level traffic switching is all or nothing
   248  			shardRoutingRules, err := wr.ts.GetShardRoutingRules(ctx)
   249  			if err != nil {
   250  				return nil, nil, err
   251  			}
   252  
   253  			rules := shardRoutingRules.Rules
   254  			for _, rule := range rules {
   255  				if rule.ToKeyspace == ts.SourceKeyspaceName() {
   256  					state.ShardsNotYetSwitched = append(state.ShardsNotYetSwitched, rule.Shard)
   257  				} else {
   258  					state.ShardsAlreadySwitched = append(state.ShardsAlreadySwitched, rule.Shard)
   259  				}
   260  			}
   261  		} else {
   262  			state.RdonlyCellsSwitched, state.RdonlyCellsNotSwitched, err = ws.GetCellsWithTableReadsSwitched(ctx, keyspace, table, topodatapb.TabletType_RDONLY)
   263  			if err != nil {
   264  				return nil, nil, err
   265  			}
   266  
   267  			state.ReplicaCellsSwitched, state.ReplicaCellsNotSwitched, err = ws.GetCellsWithTableReadsSwitched(ctx, keyspace, table, topodatapb.TabletType_REPLICA)
   268  			if err != nil {
   269  				return nil, nil, err
   270  			}
   271  			globalRules, err := topotools.GetRoutingRules(ctx, ts.TopoServer())
   272  			if err != nil {
   273  				return nil, nil, err
   274  			}
   275  			for _, table := range ts.Tables() {
   276  				rr := globalRules[table]
   277  				// if a rule exists for the table and points to the target keyspace, writes have been switched
   278  				if len(rr) > 0 && rr[0] == fmt.Sprintf("%s.%s", keyspace, table) {
   279  					state.WritesSwitched = true
   280  					break
   281  				}
   282  			}
   283  		}
   284  	} else {
   285  		state.WorkflowType = workflow.TypeReshard
   286  
   287  		// we assume a consistent state, so only choose one shard
   288  		var shard *topo.ShardInfo
   289  		if reverse {
   290  			shard = ts.TargetShards()[0]
   291  		} else {
   292  			shard = ts.SourceShards()[0]
   293  		}
   294  
   295  		state.RdonlyCellsSwitched, state.RdonlyCellsNotSwitched, err = ws.GetCellsWithShardReadsSwitched(ctx, keyspace, shard, topodatapb.TabletType_RDONLY)
   296  		if err != nil {
   297  			return nil, nil, err
   298  		}
   299  
   300  		state.ReplicaCellsSwitched, state.ReplicaCellsNotSwitched, err = ws.GetCellsWithShardReadsSwitched(ctx, keyspace, shard, topodatapb.TabletType_REPLICA)
   301  		if err != nil {
   302  			return nil, nil, err
   303  		}
   304  
   305  		if !shard.IsPrimaryServing {
   306  			state.WritesSwitched = true
   307  		}
   308  	}
   309  
   310  	return ts, state, nil
   311  }
   312  
   313  // SwitchReads is a generic way of switching read traffic for a resharding workflow.
   314  func (wr *Wrangler) SwitchReads(ctx context.Context, targetKeyspace, workflowName string, servedTypes []topodatapb.TabletType,
   315  	cells []string, direction workflow.TrafficSwitchDirection, dryRun bool) (*[]string, error) {
   316  
   317  	ts, ws, err := wr.getWorkflowState(ctx, targetKeyspace, workflowName)
   318  	if err != nil {
   319  		wr.Logger().Errorf("getWorkflowState failed: %v", err)
   320  		return nil, err
   321  	}
   322  	if ts == nil {
   323  		errorMsg := fmt.Sprintf("workflow %s not found in keyspace %s", workflowName, targetKeyspace)
   324  		wr.Logger().Errorf(errorMsg)
   325  		return nil, fmt.Errorf(errorMsg)
   326  	}
   327  	log.Infof("Switching reads: %s.%s tt %+v, cells %+v, workflow state: %+v", targetKeyspace, workflowName, servedTypes, cells, ws)
   328  	var switchReplicas, switchRdonly bool
   329  	for _, servedType := range servedTypes {
   330  		if servedType != topodatapb.TabletType_REPLICA && servedType != topodatapb.TabletType_RDONLY {
   331  			return nil, fmt.Errorf("tablet type must be REPLICA or RDONLY: %v", servedType)
   332  		}
   333  		if direction == workflow.DirectionBackward && servedType == topodatapb.TabletType_REPLICA && len(ws.ReplicaCellsSwitched) == 0 {
   334  			return nil, fmt.Errorf("requesting reversal of read traffic for REPLICAs but REPLICA reads have not been switched")
   335  		}
   336  		if direction == workflow.DirectionBackward && servedType == topodatapb.TabletType_RDONLY && len(ws.RdonlyCellsSwitched) == 0 {
   337  			return nil, fmt.Errorf("requesting reversal of SwitchReads for RDONLYs but RDONLY reads have not been switched")
   338  		}
   339  		switch servedType {
   340  		case topodatapb.TabletType_REPLICA:
   341  			switchReplicas = true
   342  		case topodatapb.TabletType_RDONLY:
   343  			switchRdonly = true
   344  		}
   345  	}
   346  
   347  	// if there are no rdonly tablets in the cells ask to switch rdonly tablets as well so that routing rules
   348  	// are updated for rdonly as well. Otherwise vitess will not know that the workflow has completed and will
   349  	// incorrectly report that not all reads have been switched. User currently is forced to switch non-existent rdonly tablets
   350  	if switchReplicas && !switchRdonly {
   351  		var err error
   352  		rdonlyTabletsExist, err := topotools.DoCellsHaveRdonlyTablets(ctx, wr.ts, cells)
   353  		if err != nil {
   354  			return nil, err
   355  		}
   356  		if !rdonlyTabletsExist {
   357  			servedTypes = append(servedTypes, topodatapb.TabletType_RDONLY)
   358  		}
   359  	}
   360  
   361  	// If journals exist notify user and fail
   362  	journalsExist, _, err := ts.checkJournals(ctx)
   363  	if err != nil {
   364  		wr.Logger().Errorf("checkJournals failed: %v", err)
   365  		return nil, err
   366  	}
   367  	if journalsExist {
   368  		log.Infof("Found a previous journal entry for %d", ts.id)
   369  	}
   370  	var sw iswitcher
   371  	if dryRun {
   372  		sw = &switcherDryRun{ts: ts, drLog: NewLogRecorder()}
   373  	} else {
   374  		sw = &switcher{ts: ts, wr: wr}
   375  	}
   376  
   377  	if err := ts.validate(ctx); err != nil {
   378  		ts.Logger().Errorf("validate failed: %v", err)
   379  		return nil, err
   380  	}
   381  
   382  	// For reads, locking the source keyspace is sufficient.
   383  	ctx, unlock, lockErr := sw.lockKeyspace(ctx, ts.SourceKeyspaceName(), "SwitchReads")
   384  	if lockErr != nil {
   385  		ts.Logger().Errorf("LockKeyspace failed: %v", lockErr)
   386  		return nil, lockErr
   387  	}
   388  	defer unlock(&err)
   389  
   390  	if ts.MigrationType() == binlogdatapb.MigrationType_TABLES {
   391  		if ts.isPartialMigration {
   392  			ts.Logger().Infof("Partial migration, skipping switchTableReads as traffic is all or nothing per shard and overridden for reads AND writes in the ShardRoutingRule created when switching writes.")
   393  		} else if err := sw.switchTableReads(ctx, cells, servedTypes, direction); err != nil {
   394  			ts.Logger().Errorf("switchTableReads failed: %v", err)
   395  			return nil, err
   396  		}
   397  		return sw.logs(), nil
   398  	}
   399  	wr.Logger().Infof("About to switchShardReads: %+v, %+v, %+v", cells, servedTypes, direction)
   400  	if err := ts.switchShardReads(ctx, cells, servedTypes, direction); err != nil {
   401  		ts.Logger().Errorf("switchShardReads failed: %v", err)
   402  		return nil, err
   403  	}
   404  
   405  	wr.Logger().Infof("switchShardReads Completed: %+v, %+v, %+v", cells, servedTypes, direction)
   406  	if err := wr.ts.ValidateSrvKeyspace(ctx, targetKeyspace, strings.Join(cells, ",")); err != nil {
   407  		err2 := vterrors.Wrapf(err, "After switching shard reads, found SrvKeyspace for %s is corrupt in cell %s",
   408  			targetKeyspace, strings.Join(cells, ","))
   409  		log.Errorf("%w", err2)
   410  		return nil, err2
   411  	}
   412  	return sw.logs(), nil
   413  }
   414  
   415  func (wr *Wrangler) areTabletsAvailableToStreamFrom(ctx context.Context, ts *trafficSwitcher, keyspace string, shards []*topo.ShardInfo) error {
   416  	var cells []string
   417  	tabletTypes := ts.optTabletTypes
   418  	if ts.optCells != "" {
   419  		cells = strings.Split(ts.optCells, ",")
   420  	}
   421  	// FIXME: currently there is a default setting in the tablet that is used if user does not specify a tablet type,
   422  	// we use the value specified in the tablet flag `-vreplication_tablet_type`
   423  	// but ideally we should populate the vreplication table with a default value when we setup the workflow
   424  	if tabletTypes == "" {
   425  		tabletTypes = "PRIMARY,REPLICA"
   426  	}
   427  
   428  	var wg sync.WaitGroup
   429  	allErrors := &concurrency.AllErrorRecorder{}
   430  	for _, shard := range shards {
   431  		wg.Add(1)
   432  		go func(cells []string, keyspace string, shard *topo.ShardInfo) {
   433  			defer wg.Done()
   434  			if cells == nil {
   435  				cells = append(cells, shard.PrimaryAlias.Cell)
   436  			}
   437  			tp, err := discovery.NewTabletPicker(wr.ts, cells, keyspace, shard.ShardName(), tabletTypes)
   438  			if err != nil {
   439  				allErrors.RecordError(err)
   440  				return
   441  			}
   442  			tablets := tp.GetMatchingTablets(ctx)
   443  			if len(tablets) == 0 {
   444  				allErrors.RecordError(fmt.Errorf("no tablet found to source data in keyspace %s, shard %s", keyspace, shard.ShardName()))
   445  				return
   446  			}
   447  		}(cells, keyspace, shard)
   448  	}
   449  
   450  	wg.Wait()
   451  	if allErrors.HasErrors() {
   452  		log.Errorf("%s", allErrors.Error())
   453  		return allErrors.Error()
   454  	}
   455  	return nil
   456  }
   457  
   458  // SwitchWrites is a generic way of migrating write traffic for a resharding workflow.
   459  func (wr *Wrangler) SwitchWrites(ctx context.Context, targetKeyspace, workflowName string, timeout time.Duration,
   460  	cancel, reverse, reverseReplication bool, dryRun bool) (journalID int64, dryRunResults *[]string, err error) {
   461  	ts, ws, err := wr.getWorkflowState(ctx, targetKeyspace, workflowName)
   462  	_ = ws
   463  	if err != nil {
   464  		wr.Logger().Errorf("getWorkflowState failed: %v", err)
   465  		return 0, nil, err
   466  	}
   467  	if ts == nil {
   468  		errorMsg := fmt.Sprintf("workflow %s not found in keyspace %s", workflowName, targetKeyspace)
   469  		wr.Logger().Errorf(errorMsg)
   470  		return 0, nil, fmt.Errorf(errorMsg)
   471  	}
   472  
   473  	var sw iswitcher
   474  	if dryRun {
   475  		sw = &switcherDryRun{ts: ts, drLog: NewLogRecorder()}
   476  	} else {
   477  		sw = &switcher{ts: ts, wr: wr}
   478  	}
   479  
   480  	if ts.frozen {
   481  		ts.Logger().Warningf("Writes have already been switched for workflow %s, nothing to do here", ts.WorkflowName())
   482  		return 0, sw.logs(), nil
   483  	}
   484  
   485  	ts.Logger().Infof("Built switching metadata: %+v", ts)
   486  	if err := ts.validate(ctx); err != nil {
   487  		ts.Logger().Errorf("validate failed: %v", err)
   488  		return 0, nil, err
   489  	}
   490  
   491  	if reverseReplication {
   492  		err := wr.areTabletsAvailableToStreamFrom(ctx, ts, ts.TargetKeyspaceName(), ts.TargetShards())
   493  		if err != nil {
   494  			return 0, nil, err
   495  		}
   496  	}
   497  
   498  	// Need to lock both source and target keyspaces.
   499  	tctx, sourceUnlock, lockErr := sw.lockKeyspace(ctx, ts.SourceKeyspaceName(), "SwitchWrites")
   500  	if lockErr != nil {
   501  		ts.Logger().Errorf("LockKeyspace failed: %v", lockErr)
   502  		return 0, nil, lockErr
   503  	}
   504  	ctx = tctx
   505  	defer sourceUnlock(&err)
   506  	if ts.TargetKeyspaceName() != ts.SourceKeyspaceName() {
   507  		tctx, targetUnlock, lockErr := sw.lockKeyspace(ctx, ts.TargetKeyspaceName(), "SwitchWrites")
   508  		if lockErr != nil {
   509  			ts.Logger().Errorf("LockKeyspace failed: %v", lockErr)
   510  			return 0, nil, lockErr
   511  		}
   512  		ctx = tctx
   513  		defer targetUnlock(&err)
   514  	}
   515  
   516  	// If no journals exist, sourceWorkflows will be initialized by sm.MigrateStreams.
   517  	journalsExist, sourceWorkflows, err := ts.checkJournals(ctx)
   518  	if err != nil {
   519  		ts.Logger().Errorf("checkJournals failed: %v", err)
   520  		return 0, nil, err
   521  	}
   522  	if !journalsExist {
   523  		ts.Logger().Infof("No previous journals were found. Proceeding normally.")
   524  		sm, err := workflow.BuildStreamMigrator(ctx, ts, cancel)
   525  		if err != nil {
   526  			ts.Logger().Errorf("buildStreamMigrater failed: %v", err)
   527  			return 0, nil, err
   528  		}
   529  		if cancel {
   530  			sw.cancelMigration(ctx, sm)
   531  			return 0, sw.logs(), nil
   532  		}
   533  
   534  		ts.Logger().Infof("Stopping streams")
   535  		sourceWorkflows, err = sw.stopStreams(ctx, sm)
   536  		if err != nil {
   537  			ts.Logger().Errorf("stopStreams failed: %v", err)
   538  			for key, streams := range sm.Streams() {
   539  				for _, stream := range streams {
   540  					ts.Logger().Errorf("stream in stopStreams: key %s shard %s stream %+v", key, stream.BinlogSource.Shard, stream.BinlogSource)
   541  				}
   542  			}
   543  			sw.cancelMigration(ctx, sm)
   544  			return 0, nil, err
   545  		}
   546  
   547  		ts.Logger().Infof("Stopping source writes")
   548  		if err := sw.stopSourceWrites(ctx); err != nil {
   549  			ts.Logger().Errorf("stopSourceWrites failed: %v", err)
   550  			sw.cancelMigration(ctx, sm)
   551  			return 0, nil, err
   552  		}
   553  
   554  		if ts.MigrationType() == binlogdatapb.MigrationType_TABLES {
   555  			ts.Logger().Infof("Executing LOCK TABLES on source tables %d times", lockTablesCycles)
   556  			// Doing this twice with a pause in-between to catch any writes that may have raced in between
   557  			// the tablet's deny list check and the first mysqld side table lock.
   558  			for cnt := 1; cnt <= lockTablesCycles; cnt++ {
   559  				if err := ts.executeLockTablesOnSource(ctx); err != nil {
   560  					ts.Logger().Errorf("Failed to execute LOCK TABLES (attempt %d of %d) on sources: %v", cnt, lockTablesCycles, err)
   561  					sw.cancelMigration(ctx, sm)
   562  					return 0, nil, err
   563  				}
   564  				// No need to UNLOCK the tables as the connection was closed once the locks were acquired
   565  				// and thus the locks released.
   566  				time.Sleep(lockTablesCycleDelay)
   567  			}
   568  		}
   569  
   570  		ts.Logger().Infof("Waiting for streams to catchup")
   571  		if err := sw.waitForCatchup(ctx, timeout); err != nil {
   572  			ts.Logger().Errorf("waitForCatchup failed: %v", err)
   573  			sw.cancelMigration(ctx, sm)
   574  			return 0, nil, err
   575  		}
   576  
   577  		ts.Logger().Infof("Migrating streams")
   578  		if err := sw.migrateStreams(ctx, sm); err != nil {
   579  			ts.Logger().Errorf("migrateStreams failed: %v", err)
   580  			sw.cancelMigration(ctx, sm)
   581  			return 0, nil, err
   582  		}
   583  
   584  		ts.Logger().Infof("Creating reverse streams")
   585  		if err := sw.createReverseVReplication(ctx); err != nil {
   586  			ts.Logger().Errorf("createReverseVReplication failed: %v", err)
   587  			sw.cancelMigration(ctx, sm)
   588  			return 0, nil, err
   589  		}
   590  	} else {
   591  		if cancel {
   592  			err := fmt.Errorf("traffic switching has reached the point of no return, cannot cancel")
   593  			ts.Logger().Errorf("%v", err)
   594  			return 0, nil, err
   595  		}
   596  		ts.Logger().Infof("Journals were found. Completing the left over steps.")
   597  		// Need to gather positions in case all journals were not created.
   598  		if err := ts.gatherPositions(ctx); err != nil {
   599  			ts.Logger().Errorf("gatherPositions failed: %v", err)
   600  			return 0, nil, err
   601  		}
   602  	}
   603  
   604  	// This is the point of no return. Once a journal is created,
   605  	// traffic can be redirected to target shards.
   606  	if err := sw.createJournals(ctx, sourceWorkflows); err != nil {
   607  		ts.Logger().Errorf("createJournals failed: %v", err)
   608  		return 0, nil, err
   609  	}
   610  	if err := sw.allowTargetWrites(ctx); err != nil {
   611  		ts.Logger().Errorf("allowTargetWrites failed: %v", err)
   612  		return 0, nil, err
   613  	}
   614  	if err := sw.changeRouting(ctx); err != nil {
   615  		ts.Logger().Errorf("changeRouting failed: %v", err)
   616  		return 0, nil, err
   617  	}
   618  	if err := sw.streamMigraterfinalize(ctx, ts, sourceWorkflows); err != nil {
   619  		ts.Logger().Errorf("finalize failed: %v", err)
   620  		return 0, nil, err
   621  	}
   622  	if reverseReplication {
   623  		if err := sw.startReverseVReplication(ctx); err != nil {
   624  			ts.Logger().Errorf("startReverseVReplication failed: %v", err)
   625  			return 0, nil, err
   626  		}
   627  	}
   628  
   629  	if err := sw.freezeTargetVReplication(ctx); err != nil {
   630  		ts.Logger().Errorf("deleteTargetVReplication failed: %v", err)
   631  		return 0, nil, err
   632  	}
   633  
   634  	return ts.id, sw.logs(), nil
   635  }
   636  
   637  // DropTargets cleans up target tables, shards and denied tables if a MoveTables/Reshard is cancelled
   638  func (wr *Wrangler) DropTargets(ctx context.Context, targetKeyspace, workflow string, keepData, keepRoutingRules, dryRun bool) (*[]string, error) {
   639  	ts, err := wr.buildTrafficSwitcher(ctx, targetKeyspace, workflow)
   640  	if err != nil {
   641  		wr.Logger().Errorf("buildTrafficSwitcher failed: %v", err)
   642  		return nil, err
   643  	}
   644  	ts.keepRoutingRules = keepRoutingRules
   645  	var sw iswitcher
   646  	if dryRun {
   647  		sw = &switcherDryRun{ts: ts, drLog: NewLogRecorder()}
   648  	} else {
   649  		sw = &switcher{ts: ts, wr: wr}
   650  	}
   651  	var tctx context.Context
   652  	tctx, sourceUnlock, lockErr := sw.lockKeyspace(ctx, ts.SourceKeyspaceName(), "DropTargets")
   653  	if lockErr != nil {
   654  		ts.Logger().Errorf("Source LockKeyspace failed: %v", lockErr)
   655  		return nil, lockErr
   656  	}
   657  	defer sourceUnlock(&err)
   658  	ctx = tctx
   659  	if ts.TargetKeyspaceName() != ts.SourceKeyspaceName() {
   660  		tctx, targetUnlock, lockErr := sw.lockKeyspace(ctx, ts.TargetKeyspaceName(), "DropTargets")
   661  		if lockErr != nil {
   662  			ts.Logger().Errorf("Target LockKeyspace failed: %v", lockErr)
   663  			return nil, lockErr
   664  		}
   665  		defer targetUnlock(&err)
   666  		ctx = tctx
   667  	}
   668  	if !keepData {
   669  		switch ts.MigrationType() {
   670  		case binlogdatapb.MigrationType_TABLES:
   671  			log.Infof("Deleting target tables")
   672  			if err := sw.removeTargetTables(ctx); err != nil {
   673  				return nil, err
   674  			}
   675  			if err := sw.dropSourceDeniedTables(ctx); err != nil {
   676  				return nil, err
   677  			}
   678  		case binlogdatapb.MigrationType_SHARDS:
   679  			log.Infof("Removing target shards")
   680  			if err := sw.dropTargetShards(ctx); err != nil {
   681  				return nil, err
   682  			}
   683  		}
   684  	}
   685  	if err := wr.dropArtifacts(ctx, keepRoutingRules, sw); err != nil {
   686  		return nil, err
   687  	}
   688  	if err := ts.TopoServer().RebuildSrvVSchema(ctx, nil); err != nil {
   689  		return nil, err
   690  	}
   691  	return sw.logs(), nil
   692  }
   693  
   694  func (wr *Wrangler) dropArtifacts(ctx context.Context, keepRoutingRules bool, sw iswitcher) error {
   695  	if err := sw.dropSourceReverseVReplicationStreams(ctx); err != nil {
   696  		return err
   697  	}
   698  	if err := sw.dropTargetVReplicationStreams(ctx); err != nil {
   699  		return err
   700  	}
   701  	if !keepRoutingRules {
   702  		if err := sw.deleteRoutingRules(ctx); err != nil {
   703  			return err
   704  		}
   705  		if err := sw.deleteShardRoutingRules(ctx); err != nil {
   706  			return err
   707  		}
   708  	}
   709  
   710  	return nil
   711  }
   712  
   713  // finalizeMigrateWorkflow deletes the streams for the Migrate workflow.
   714  // We only cleanup the target for external sources
   715  func (wr *Wrangler) finalizeMigrateWorkflow(ctx context.Context, targetKeyspace, workflow, tableSpecs string,
   716  	cancel, keepData, keepRoutingRules, dryRun bool) (*[]string, error) {
   717  	ts, err := wr.buildTrafficSwitcher(ctx, targetKeyspace, workflow)
   718  	if err != nil {
   719  		wr.Logger().Errorf("buildTrafficSwitcher failed: %v", err)
   720  		return nil, err
   721  	}
   722  	var sw iswitcher
   723  	if dryRun {
   724  		sw = &switcherDryRun{ts: ts, drLog: NewLogRecorder()}
   725  	} else {
   726  		sw = &switcher{ts: ts, wr: wr}
   727  	}
   728  	var tctx context.Context
   729  	tctx, targetUnlock, lockErr := sw.lockKeyspace(ctx, ts.TargetKeyspaceName(), "completeMigrateWorkflow")
   730  	if lockErr != nil {
   731  		ts.Logger().Errorf("Target LockKeyspace failed: %v", lockErr)
   732  		return nil, lockErr
   733  	}
   734  	defer targetUnlock(&err)
   735  	ctx = tctx
   736  	if err := sw.dropTargetVReplicationStreams(ctx); err != nil {
   737  		return nil, err
   738  	}
   739  	if !cancel {
   740  		sw.addParticipatingTablesToKeyspace(ctx, targetKeyspace, tableSpecs)
   741  		if err := ts.TopoServer().RebuildSrvVSchema(ctx, nil); err != nil {
   742  			return nil, err
   743  		}
   744  	}
   745  	log.Infof("cancel is %t, keepData %t", cancel, keepData)
   746  	if cancel && !keepData {
   747  		if err := sw.removeTargetTables(ctx); err != nil {
   748  			return nil, err
   749  		}
   750  	}
   751  	return sw.logs(), nil
   752  }
   753  
   754  // DropSources cleans up source tables, shards and denied tables after a MoveTables/Reshard is completed
   755  func (wr *Wrangler) DropSources(ctx context.Context, targetKeyspace, workflowName string, removalType workflow.TableRemovalType, keepData, keepRoutingRules, force, dryRun bool) (*[]string, error) {
   756  	ts, err := wr.buildTrafficSwitcher(ctx, targetKeyspace, workflowName)
   757  	if err != nil {
   758  		wr.Logger().Errorf("buildTrafficSwitcher failed: %v", err)
   759  		return nil, err
   760  	}
   761  	var sw iswitcher
   762  	if dryRun {
   763  		sw = &switcherDryRun{ts: ts, drLog: NewLogRecorder()}
   764  	} else {
   765  		sw = &switcher{ts: ts, wr: wr}
   766  	}
   767  	var tctx context.Context
   768  	tctx, sourceUnlock, lockErr := sw.lockKeyspace(ctx, ts.SourceKeyspaceName(), "DropSources")
   769  	if lockErr != nil {
   770  		ts.Logger().Errorf("Source LockKeyspace failed: %v", lockErr)
   771  		return nil, lockErr
   772  	}
   773  	defer sourceUnlock(&err)
   774  	ctx = tctx
   775  	if ts.TargetKeyspaceName() != ts.SourceKeyspaceName() {
   776  		tctx, targetUnlock, lockErr := sw.lockKeyspace(ctx, ts.TargetKeyspaceName(), "DropSources")
   777  		if lockErr != nil {
   778  			ts.Logger().Errorf("Target LockKeyspace failed: %v", lockErr)
   779  			return nil, lockErr
   780  		}
   781  		defer targetUnlock(&err)
   782  		ctx = tctx
   783  	}
   784  	if !force {
   785  		if err := sw.validateWorkflowHasCompleted(ctx); err != nil {
   786  			wr.Logger().Errorf("Workflow has not completed, cannot DropSources: %v", err)
   787  			return nil, err
   788  		}
   789  	}
   790  	if !keepData {
   791  		switch ts.MigrationType() {
   792  		case binlogdatapb.MigrationType_TABLES:
   793  			log.Infof("Deleting tables")
   794  			if err := sw.removeSourceTables(ctx, removalType); err != nil {
   795  				return nil, err
   796  			}
   797  			if err := sw.dropSourceDeniedTables(ctx); err != nil {
   798  				return nil, err
   799  			}
   800  
   801  		case binlogdatapb.MigrationType_SHARDS:
   802  			log.Infof("Removing shards")
   803  			if err := sw.dropSourceShards(ctx); err != nil {
   804  				return nil, err
   805  			}
   806  		}
   807  	}
   808  	if err := wr.dropArtifacts(ctx, keepRoutingRules, sw); err != nil {
   809  		return nil, err
   810  	}
   811  	if err := ts.TopoServer().RebuildSrvVSchema(ctx, nil); err != nil {
   812  		return nil, err
   813  	}
   814  
   815  	return sw.logs(), nil
   816  }
   817  
   818  func (wr *Wrangler) buildTrafficSwitcher(ctx context.Context, targetKeyspace, workflowName string) (*trafficSwitcher, error) {
   819  	tgtInfo, err := workflow.BuildTargets(ctx, wr.ts, wr.tmc, targetKeyspace, workflowName)
   820  	if err != nil {
   821  		log.Infof("Error building targets: %s", err)
   822  		return nil, err
   823  	}
   824  	targets, frozen, optCells, optTabletTypes := tgtInfo.Targets, tgtInfo.Frozen, tgtInfo.OptCells, tgtInfo.OptTabletTypes
   825  
   826  	ts := &trafficSwitcher{
   827  		wr:              wr,
   828  		workflow:        workflowName,
   829  		reverseWorkflow: workflow.ReverseWorkflowName(workflowName),
   830  		id:              workflow.HashStreams(targetKeyspace, targets),
   831  		targets:         targets,
   832  		sources:         make(map[string]*workflow.MigrationSource),
   833  		targetKeyspace:  targetKeyspace,
   834  		frozen:          frozen,
   835  		optCells:        optCells,
   836  		optTabletTypes:  optTabletTypes,
   837  		workflowType:    tgtInfo.WorkflowType,
   838  		workflowSubType: tgtInfo.WorkflowSubType,
   839  	}
   840  	log.Infof("Migration ID for workflow %s: %d", workflowName, ts.id)
   841  	sourceTopo := wr.ts
   842  
   843  	// Build the sources
   844  	for _, target := range targets {
   845  		for _, bls := range target.Sources {
   846  			if ts.sourceKeyspace == "" {
   847  				ts.sourceKeyspace = bls.Keyspace
   848  				ts.sourceTimeZone = bls.SourceTimeZone
   849  				ts.targetTimeZone = bls.TargetTimeZone
   850  				ts.externalCluster = bls.ExternalCluster
   851  				if ts.externalCluster != "" {
   852  					externalTopo, err := wr.ts.OpenExternalVitessClusterServer(ctx, ts.externalCluster)
   853  					if err != nil {
   854  						return nil, err
   855  					}
   856  					sourceTopo = externalTopo
   857  					ts.externalTopo = externalTopo
   858  				}
   859  			} else if ts.sourceKeyspace != bls.Keyspace {
   860  				return nil, fmt.Errorf("source keyspaces are mismatched across streams: %v vs %v", ts.sourceKeyspace, bls.Keyspace)
   861  			}
   862  
   863  			if ts.tables == nil {
   864  				for _, rule := range bls.Filter.Rules {
   865  					ts.tables = append(ts.tables, rule.Match)
   866  				}
   867  				sort.Strings(ts.tables)
   868  			} else {
   869  				var tables []string
   870  				for _, rule := range bls.Filter.Rules {
   871  					tables = append(tables, rule.Match)
   872  				}
   873  				sort.Strings(tables)
   874  				if !reflect.DeepEqual(ts.tables, tables) {
   875  					return nil, fmt.Errorf("table lists are mismatched across streams: %v vs %v", ts.tables, tables)
   876  				}
   877  			}
   878  
   879  			if _, ok := ts.sources[bls.Shard]; ok {
   880  				continue
   881  			}
   882  			sourcesi, err := sourceTopo.GetShard(ctx, bls.Keyspace, bls.Shard)
   883  			if err != nil {
   884  				return nil, err
   885  			}
   886  			sourcePrimary, err := sourceTopo.GetTablet(ctx, sourcesi.PrimaryAlias)
   887  			if err != nil {
   888  				return nil, err
   889  			}
   890  			ts.sources[bls.Shard] = workflow.NewMigrationSource(sourcesi, sourcePrimary)
   891  		}
   892  	}
   893  	if ts.sourceKeyspace != ts.targetKeyspace || ts.externalCluster != "" {
   894  		ts.migrationType = binlogdatapb.MigrationType_TABLES
   895  	} else {
   896  		// TODO(sougou): for shard migration, validate that source and target combined
   897  		// keyranges match.
   898  		ts.migrationType = binlogdatapb.MigrationType_SHARDS
   899  		for sourceShard := range ts.sources {
   900  			if _, ok := ts.targets[sourceShard]; ok {
   901  				// If shards are overlapping, then this is a table migration.
   902  				ts.migrationType = binlogdatapb.MigrationType_TABLES
   903  				break
   904  			}
   905  		}
   906  	}
   907  	vs, err := sourceTopo.GetVSchema(ctx, ts.sourceKeyspace)
   908  	if err != nil {
   909  		return nil, err
   910  	}
   911  	ts.sourceKSSchema, err = vindexes.BuildKeyspaceSchema(vs, ts.sourceKeyspace)
   912  	if err != nil {
   913  		return nil, err
   914  	}
   915  
   916  	sourceShards, targetShards := ts.getSourceAndTargetShardsNames()
   917  
   918  	ts.isPartialMigration, err = ts.isPartialMoveTables(sourceShards, targetShards)
   919  	if err != nil {
   920  		return nil, err
   921  	}
   922  	if ts.isPartialMigration {
   923  		log.Infof("Migration is partial, for shards %+v", sourceShards)
   924  	}
   925  	return ts, nil
   926  }
   927  
   928  func (ts *trafficSwitcher) getSourceAndTargetShardsNames() ([]string, []string) {
   929  	var sourceShards, targetShards []string
   930  	for _, si := range ts.SourceShards() {
   931  		sourceShards = append(sourceShards, si.ShardName())
   932  	}
   933  	for _, si := range ts.TargetShards() {
   934  		targetShards = append(targetShards, si.ShardName())
   935  	}
   936  	return sourceShards, targetShards
   937  }
   938  
   939  // isPartialMoveTables returns true if whe workflow is MoveTables,
   940  // has the same number of shards, is not covering the entire shard range, and has one-to-one shards in source and target
   941  func (ts *trafficSwitcher) isPartialMoveTables(sourceShards, targetShards []string) (bool, error) {
   942  
   943  	if ts.MigrationType() != binlogdatapb.MigrationType_TABLES {
   944  		return false, nil
   945  	}
   946  
   947  	skr, tkr, err := getSourceAndTargetKeyRanges(sourceShards, targetShards)
   948  	if err != nil {
   949  		return false, err
   950  	}
   951  
   952  	if !key.KeyRangeIsPartial(skr) || !key.KeyRangeIsPartial(tkr) || // both cover full range
   953  		len(sourceShards) != len(targetShards) {
   954  
   955  		return false, nil
   956  	}
   957  
   958  	return key.KeyRangeEqual(skr, tkr), nil
   959  }
   960  
   961  func getSourceAndTargetKeyRanges(sourceShards, targetShards []string) (*topodatapb.KeyRange, *topodatapb.KeyRange, error) {
   962  	if len(sourceShards) == 0 || len(targetShards) == 0 {
   963  		return nil, nil, fmt.Errorf("either source or target shards are missing")
   964  	}
   965  
   966  	getKeyRange := func(shard string) (*topodatapb.KeyRange, error) {
   967  		krs, err := key.ParseShardingSpec(shard)
   968  		if err != nil {
   969  			return nil, err
   970  		}
   971  		return krs[0], nil
   972  	}
   973  
   974  	// happily string sorting of shards also sorts them in the ascending order of key ranges in vitess
   975  	sort.Strings(sourceShards)
   976  	sort.Strings(targetShards)
   977  	getFullKeyRange := func(shards []string) (*topodatapb.KeyRange, error) {
   978  		// expect sorted shards
   979  		kr1, err := getKeyRange(sourceShards[0])
   980  		if err != nil {
   981  			return nil, err
   982  		}
   983  		kr2, err := getKeyRange(sourceShards[len(sourceShards)-1])
   984  		if err != nil {
   985  			return nil, err
   986  		}
   987  		return &topodatapb.KeyRange{
   988  			Start: kr1.Start,
   989  			End:   kr2.End,
   990  		}, nil
   991  	}
   992  
   993  	skr, err := getFullKeyRange(sourceShards)
   994  	if err != nil {
   995  		return nil, nil, err
   996  	}
   997  	tkr, err := getFullKeyRange(targetShards)
   998  	if err != nil {
   999  		return nil, nil, err
  1000  	}
  1001  
  1002  	return skr, tkr, nil
  1003  }
  1004  
  1005  func (ts *trafficSwitcher) validate(ctx context.Context) error {
  1006  	if ts.MigrationType() == binlogdatapb.MigrationType_TABLES {
  1007  		if ts.isPartialMigration {
  1008  			return nil
  1009  		}
  1010  		sourceTopo := ts.wr.ts
  1011  		if ts.externalTopo != nil {
  1012  			sourceTopo = ts.externalTopo
  1013  		}
  1014  
  1015  		// All shards must be present.
  1016  		if err := workflow.CompareShards(ctx, ts.SourceKeyspaceName(), ts.SourceShards(), sourceTopo); err != nil {
  1017  			return err
  1018  		}
  1019  		if err := workflow.CompareShards(ctx, ts.TargetKeyspaceName(), ts.TargetShards(), ts.wr.ts); err != nil {
  1020  			return err
  1021  		}
  1022  		// Wildcard table names not allowed.
  1023  		for _, table := range ts.tables {
  1024  			if strings.HasPrefix(table, "/") {
  1025  				return fmt.Errorf("cannot migrate streams with wild card table names: %v", table)
  1026  			}
  1027  		}
  1028  	}
  1029  	return nil
  1030  }
  1031  
  1032  func (ts *trafficSwitcher) switchTableReads(ctx context.Context, cells []string, servedTypes []topodatapb.TabletType, direction workflow.TrafficSwitchDirection) error {
  1033  	log.Infof("switchTableReads: servedTypes: %+v, direction %t", servedTypes, direction)
  1034  	rules, err := topotools.GetRoutingRules(ctx, ts.TopoServer())
  1035  	if err != nil {
  1036  		return err
  1037  	}
  1038  	// We assume that the following rules were setup when the targets were created:
  1039  	// table -> sourceKeyspace.table
  1040  	// targetKeyspace.table -> sourceKeyspace.table
  1041  	// For forward migration, we add tablet type specific rules to redirect traffic to the target.
  1042  	// For backward, we redirect to source
  1043  	for _, servedType := range servedTypes {
  1044  		tt := strings.ToLower(servedType.String())
  1045  		for _, table := range ts.Tables() {
  1046  			if direction == workflow.DirectionForward {
  1047  				log.Infof("Route direction forward")
  1048  				toTarget := []string{ts.TargetKeyspaceName() + "." + table}
  1049  				rules[table+"@"+tt] = toTarget
  1050  				rules[ts.TargetKeyspaceName()+"."+table+"@"+tt] = toTarget
  1051  				rules[ts.SourceKeyspaceName()+"."+table+"@"+tt] = toTarget
  1052  			} else {
  1053  				log.Infof("Route direction backwards")
  1054  				toSource := []string{ts.SourceKeyspaceName() + "." + table}
  1055  				rules[table+"@"+tt] = toSource
  1056  				rules[ts.TargetKeyspaceName()+"."+table+"@"+tt] = toSource
  1057  				rules[ts.SourceKeyspaceName()+"."+table+"@"+tt] = toSource
  1058  			}
  1059  		}
  1060  	}
  1061  	if err := topotools.SaveRoutingRules(ctx, ts.TopoServer(), rules); err != nil {
  1062  		return err
  1063  	}
  1064  	return ts.TopoServer().RebuildSrvVSchema(ctx, cells)
  1065  }
  1066  
  1067  func (ts *trafficSwitcher) switchShardReads(ctx context.Context, cells []string, servedTypes []topodatapb.TabletType, direction workflow.TrafficSwitchDirection) error {
  1068  	var fromShards, toShards []*topo.ShardInfo
  1069  	if direction == workflow.DirectionForward {
  1070  		fromShards, toShards = ts.SourceShards(), ts.TargetShards()
  1071  	} else {
  1072  		fromShards, toShards = ts.TargetShards(), ts.SourceShards()
  1073  	}
  1074  	if err := ts.TopoServer().ValidateSrvKeyspace(ctx, ts.TargetKeyspaceName(), strings.Join(cells, ",")); err != nil {
  1075  		err2 := vterrors.Wrapf(err, "Before switching shard reads, found SrvKeyspace for %s is corrupt in cell %s",
  1076  			ts.TargetKeyspaceName(), strings.Join(cells, ","))
  1077  		log.Errorf("%w", err2)
  1078  		return err2
  1079  	}
  1080  	for _, servedType := range servedTypes {
  1081  		if err := ts.wr.updateShardRecords(ctx, ts.SourceKeyspaceName(), fromShards, cells, servedType, true /* isFrom */, false /* clearSourceShards */); err != nil {
  1082  			return err
  1083  		}
  1084  		if err := ts.wr.updateShardRecords(ctx, ts.SourceKeyspaceName(), toShards, cells, servedType, false, false); err != nil {
  1085  			return err
  1086  		}
  1087  		err := ts.TopoServer().MigrateServedType(ctx, ts.SourceKeyspaceName(), toShards, fromShards, servedType, cells)
  1088  		if err != nil {
  1089  			return err
  1090  		}
  1091  	}
  1092  	if err := ts.TopoServer().ValidateSrvKeyspace(ctx, ts.TargetKeyspaceName(), strings.Join(cells, ",")); err != nil {
  1093  		err2 := vterrors.Wrapf(err, "After switching shard reads, found SrvKeyspace for %s is corrupt in cell %s",
  1094  			ts.TargetKeyspaceName(), strings.Join(cells, ","))
  1095  		log.Errorf("%w", err2)
  1096  		return err2
  1097  	}
  1098  	return nil
  1099  }
  1100  
  1101  // checkJournals returns true if at least one journal has been created.
  1102  // If so, it also returns the list of sourceWorkflows that need to be switched.
  1103  func (ts *trafficSwitcher) checkJournals(ctx context.Context) (journalsExist bool, sourceWorkflows []string, err error) {
  1104  	var (
  1105  		ws = workflow.NewServer(ts.TopoServer(), ts.TabletManagerClient())
  1106  		mu sync.Mutex
  1107  	)
  1108  
  1109  	err = ts.ForAllSources(func(source *workflow.MigrationSource) error {
  1110  		mu.Lock()
  1111  		defer mu.Unlock()
  1112  		journal, exists, err := ws.CheckReshardingJournalExistsOnTablet(ctx, source.GetPrimary().Tablet, ts.id)
  1113  		if err != nil {
  1114  			return err
  1115  		}
  1116  		if exists {
  1117  			if journal.Id != 0 {
  1118  				sourceWorkflows = journal.SourceWorkflows
  1119  			}
  1120  			source.Journaled = true
  1121  			journalsExist = true
  1122  		}
  1123  		return nil
  1124  	})
  1125  	return journalsExist, sourceWorkflows, err
  1126  }
  1127  
  1128  func (ts *trafficSwitcher) stopSourceWrites(ctx context.Context) error {
  1129  	var err error
  1130  	if ts.MigrationType() == binlogdatapb.MigrationType_TABLES {
  1131  		err = ts.changeTableSourceWrites(ctx, disallowWrites)
  1132  	} else {
  1133  		err = ts.changeShardsAccess(ctx, ts.SourceKeyspaceName(), ts.SourceShards(), disallowWrites)
  1134  	}
  1135  	if err != nil {
  1136  		log.Warningf("Error: %s", err)
  1137  		return err
  1138  	}
  1139  	return ts.ForAllSources(func(source *workflow.MigrationSource) error {
  1140  		var err error
  1141  		source.Position, err = ts.TabletManagerClient().PrimaryPosition(ctx, source.GetPrimary().Tablet)
  1142  		ts.wr.Logger().Infof("Stopped Source Writes. Position for source %v:%v: %v",
  1143  			ts.SourceKeyspaceName(), source.GetShard().ShardName(), source.Position)
  1144  		if err != nil {
  1145  			log.Warningf("Error: %s", err)
  1146  		}
  1147  		return err
  1148  	})
  1149  }
  1150  
  1151  func (ts *trafficSwitcher) changeTableSourceWrites(ctx context.Context, access accessType) error {
  1152  	return ts.ForAllSources(func(source *workflow.MigrationSource) error {
  1153  		if _, err := ts.TopoServer().UpdateShardFields(ctx, ts.SourceKeyspaceName(), source.GetShard().ShardName(), func(si *topo.ShardInfo) error {
  1154  			return si.UpdateSourceDeniedTables(ctx, topodatapb.TabletType_PRIMARY, nil, access == allowWrites /* remove */, ts.Tables())
  1155  		}); err != nil {
  1156  			return err
  1157  		}
  1158  		rtbsCtx, cancel := context.WithTimeout(ctx, shardTabletRefreshTimeout)
  1159  		defer cancel()
  1160  		isPartial, partialDetails, err := topotools.RefreshTabletsByShard(rtbsCtx, ts.TopoServer(), ts.TabletManagerClient(), source.GetShard(), nil, ts.Logger())
  1161  		if isPartial {
  1162  			err = fmt.Errorf("failed to successfully refresh all tablets in the %s/%s source shard (%v):\n  %v",
  1163  				source.GetShard().Keyspace(), source.GetShard().ShardName(), err, partialDetails)
  1164  		}
  1165  		return err
  1166  	})
  1167  }
  1168  
  1169  // executeLockTablesOnSource executes a LOCK TABLES tb1 READ, tbl2 READ,... statement on each
  1170  // source shard's primary tablet using a non-pooled connection as the DBA user. The connection
  1171  // is closed when the LOCK TABLES statement returns, so we immediately release the LOCKs.
  1172  func (ts *trafficSwitcher) executeLockTablesOnSource(ctx context.Context) error {
  1173  	ts.Logger().Infof("Locking (and then immediately unlocking) the following tables on source keyspace %v: %v", ts.SourceKeyspaceName(), ts.Tables())
  1174  	if len(ts.Tables()) == 0 {
  1175  		return vterrors.Errorf(vtrpcpb.Code_INTERNAL, "no tables found in the source keyspace %v associated with the %s workflow", ts.SourceKeyspaceName(), ts.WorkflowName())
  1176  	}
  1177  
  1178  	sb := strings.Builder{}
  1179  	sb.WriteString("LOCK TABLES ")
  1180  	for _, tableName := range ts.Tables() {
  1181  		sb.WriteString(fmt.Sprintf("%s READ,", sqlescape.EscapeID(tableName)))
  1182  	}
  1183  	// trim extra trailing comma
  1184  	lockStmt := sb.String()[:sb.Len()-1]
  1185  
  1186  	return ts.ForAllSources(func(source *workflow.MigrationSource) error {
  1187  		primary := source.GetPrimary()
  1188  		if primary == nil {
  1189  			return vterrors.Errorf(vtrpcpb.Code_INTERNAL, "no primary found for source shard %s", source.GetShard())
  1190  		}
  1191  		tablet := primary.Tablet
  1192  		_, err := ts.wr.ExecuteFetchAsDba(ctx, tablet.Alias, lockStmt, 1, false, true)
  1193  		if err != nil {
  1194  			ts.Logger().Errorf("Error executing %s on source tablet %v: %v", lockStmt, tablet, err)
  1195  			return err
  1196  		}
  1197  		return err
  1198  	})
  1199  }
  1200  
  1201  func (ts *trafficSwitcher) waitForCatchup(ctx context.Context, filteredReplicationWaitTime time.Duration) error {
  1202  	ctx, cancel := context.WithTimeout(ctx, filteredReplicationWaitTime)
  1203  	defer cancel()
  1204  	// source writes have been stopped, wait for all streams on targets to catch up
  1205  	if err := ts.ForAllUIDs(func(target *workflow.MigrationTarget, uid uint32) error {
  1206  		ts.Logger().Infof("Before Catchup: uid: %d, target primary %s, target position %s, shard %s", uid,
  1207  			target.GetPrimary().AliasString(), target.Position, target.GetShard().String())
  1208  		bls := target.Sources[uid]
  1209  		source := ts.Sources()[bls.Shard]
  1210  		ts.Logger().Infof("Before Catchup: waiting for keyspace:shard: %v:%v to reach source position %v, uid %d",
  1211  			ts.TargetKeyspaceName(), target.GetShard().ShardName(), source.Position, uid)
  1212  		if err := ts.TabletManagerClient().VReplicationWaitForPos(ctx, target.GetPrimary().Tablet, int(uid), source.Position); err != nil {
  1213  			return err
  1214  		}
  1215  		log.Infof("After catchup: target keyspace:shard: %v:%v, source position %v, uid %d",
  1216  			ts.TargetKeyspaceName(), target.GetShard().ShardName(), source.Position, uid)
  1217  		ts.Logger().Infof("After catchup: position for keyspace:shard: %v:%v reached, uid %d",
  1218  			ts.TargetKeyspaceName(), target.GetShard().ShardName(), uid)
  1219  		if _, err := ts.TabletManagerClient().VReplicationExec(ctx, target.GetPrimary().Tablet, binlogplayer.StopVReplication(uid, "stopped for cutover")); err != nil {
  1220  			log.Infof("error marking stopped for cutover on %s, uid %d", target.GetPrimary().AliasString(), uid)
  1221  			return err
  1222  		}
  1223  		return nil
  1224  	}); err != nil {
  1225  		return err
  1226  	}
  1227  	// all targets have caught up, record their positions for setting up reverse workflows
  1228  	return ts.ForAllTargets(func(target *workflow.MigrationTarget) error {
  1229  		var err error
  1230  		target.Position, err = ts.TabletManagerClient().PrimaryPosition(ctx, target.GetPrimary().Tablet)
  1231  		ts.Logger().Infof("After catchup, position for target primary %s, %v", target.GetPrimary().AliasString(), target.Position)
  1232  		return err
  1233  	})
  1234  }
  1235  
  1236  func (ts *trafficSwitcher) cancelMigration(ctx context.Context, sm *workflow.StreamMigrator) {
  1237  	var err error
  1238  	if ts.MigrationType() == binlogdatapb.MigrationType_TABLES {
  1239  		err = ts.changeTableSourceWrites(ctx, allowWrites)
  1240  	} else {
  1241  		err = ts.changeShardsAccess(ctx, ts.SourceKeyspaceName(), ts.SourceShards(), allowWrites)
  1242  	}
  1243  	if err != nil {
  1244  		ts.Logger().Errorf("Cancel migration failed:", err)
  1245  	}
  1246  
  1247  	sm.CancelMigration(ctx)
  1248  
  1249  	err = ts.ForAllTargets(func(target *workflow.MigrationTarget) error {
  1250  		query := fmt.Sprintf("update _vt.vreplication set state='Running', message='' where db_name=%s and workflow=%s", encodeString(target.GetPrimary().DbName()), encodeString(ts.WorkflowName()))
  1251  		_, err := ts.TabletManagerClient().VReplicationExec(ctx, target.GetPrimary().Tablet, query)
  1252  		return err
  1253  	})
  1254  	if err != nil {
  1255  		ts.Logger().Errorf("Cancel migration failed: could not restart vreplication: %v", err)
  1256  	}
  1257  
  1258  	err = ts.deleteReverseVReplication(ctx)
  1259  	if err != nil {
  1260  		ts.Logger().Errorf("Cancel migration failed: could not delete revers vreplication entries: %v", err)
  1261  	}
  1262  }
  1263  
  1264  func (ts *trafficSwitcher) gatherPositions(ctx context.Context) error {
  1265  	err := ts.ForAllSources(func(source *workflow.MigrationSource) error {
  1266  		var err error
  1267  		source.Position, err = ts.TabletManagerClient().PrimaryPosition(ctx, source.GetPrimary().Tablet)
  1268  		ts.Logger().Infof("Position for source %v:%v: %v", ts.SourceKeyspaceName(), source.GetShard().ShardName(), source.Position)
  1269  		return err
  1270  	})
  1271  	if err != nil {
  1272  		return err
  1273  	}
  1274  	return ts.ForAllTargets(func(target *workflow.MigrationTarget) error {
  1275  		var err error
  1276  		target.Position, err = ts.TabletManagerClient().PrimaryPosition(ctx, target.GetPrimary().Tablet)
  1277  		ts.Logger().Infof("Position for target %v:%v: %v", ts.TargetKeyspaceName(), target.GetShard().ShardName(), target.Position)
  1278  		return err
  1279  	})
  1280  }
  1281  
  1282  func (ts *trafficSwitcher) createReverseVReplication(ctx context.Context) error {
  1283  	if err := ts.deleteReverseVReplication(ctx); err != nil {
  1284  		return err
  1285  	}
  1286  	err := ts.ForAllUIDs(func(target *workflow.MigrationTarget, uid uint32) error {
  1287  		bls := target.Sources[uid]
  1288  		source := ts.Sources()[bls.Shard]
  1289  		reverseBls := &binlogdatapb.BinlogSource{
  1290  			Keyspace:       ts.TargetKeyspaceName(),
  1291  			Shard:          target.GetShard().ShardName(),
  1292  			TabletType:     bls.TabletType,
  1293  			Filter:         &binlogdatapb.Filter{},
  1294  			OnDdl:          bls.OnDdl,
  1295  			SourceTimeZone: bls.TargetTimeZone,
  1296  			TargetTimeZone: bls.SourceTimeZone,
  1297  		}
  1298  
  1299  		for _, rule := range bls.Filter.Rules {
  1300  			if rule.Filter == "exclude" {
  1301  				reverseBls.Filter.Rules = append(reverseBls.Filter.Rules, rule)
  1302  				continue
  1303  			}
  1304  			var filter string
  1305  			if strings.HasPrefix(rule.Match, "/") {
  1306  				if ts.SourceKeyspaceSchema().Keyspace.Sharded {
  1307  					filter = key.KeyRangeString(source.GetShard().KeyRange)
  1308  				}
  1309  			} else {
  1310  				var inKeyrange string
  1311  				if ts.SourceKeyspaceSchema().Keyspace.Sharded {
  1312  					vtable, ok := ts.SourceKeyspaceSchema().Tables[rule.Match]
  1313  					if !ok {
  1314  						return fmt.Errorf("table %s not found in vschema1", rule.Match)
  1315  					}
  1316  					// TODO(sougou): handle degenerate cases like sequence, etc.
  1317  					// We currently assume the primary vindex is the best way to filter, which may not be true.
  1318  					inKeyrange = fmt.Sprintf(" where in_keyrange(%s, '%s.%s', '%s')", sqlparser.String(vtable.ColumnVindexes[0].Columns[0]), ts.SourceKeyspaceName(), vtable.ColumnVindexes[0].Name, key.KeyRangeString(source.GetShard().KeyRange))
  1319  				}
  1320  				filter = fmt.Sprintf("select * from %s%s", sqlescape.EscapeID(rule.Match), inKeyrange)
  1321  			}
  1322  			reverseBls.Filter.Rules = append(reverseBls.Filter.Rules, &binlogdatapb.Rule{
  1323  				Match:  rule.Match,
  1324  				Filter: filter,
  1325  			})
  1326  		}
  1327  		log.Infof("Creating reverse workflow vreplication stream on tablet %s: workflow %s, startPos %s",
  1328  			source.GetPrimary().Alias, ts.ReverseWorkflowName(), target.Position)
  1329  		_, err := ts.VReplicationExec(ctx, source.GetPrimary().Alias,
  1330  			binlogplayer.CreateVReplicationState(ts.ReverseWorkflowName(), reverseBls, target.Position,
  1331  				binlogplayer.BlpStopped, source.GetPrimary().DbName(), ts.workflowType, ts.workflowSubType))
  1332  		if err != nil {
  1333  			return err
  1334  		}
  1335  
  1336  		// if user has defined the cell/tablet_types parameters in the forward workflow, update the reverse workflow as well
  1337  		updateQuery := ts.getReverseVReplicationUpdateQuery(target.GetPrimary().Alias.Cell, source.GetPrimary().Alias.Cell, source.GetPrimary().DbName())
  1338  		if updateQuery != "" {
  1339  			log.Infof("Updating vreplication stream entry on %s with: %s", source.GetPrimary().Alias, updateQuery)
  1340  			_, err = ts.VReplicationExec(ctx, source.GetPrimary().Alias, updateQuery)
  1341  			return err
  1342  		}
  1343  		return nil
  1344  	})
  1345  	return err
  1346  }
  1347  
  1348  func (ts *trafficSwitcher) getReverseVReplicationUpdateQuery(targetCell string, sourceCell string, dbname string) string {
  1349  	// we try to be clever to understand what user intends:
  1350  	// if target's cell is present in cells but not source's cell we replace it with the source's cell
  1351  	if ts.optCells != "" && targetCell != sourceCell && strings.Contains(ts.optCells+",", targetCell+",") &&
  1352  		!strings.Contains(ts.optCells+",", sourceCell+",") {
  1353  		ts.optCells = strings.Replace(ts.optCells, targetCell, sourceCell, 1)
  1354  	}
  1355  
  1356  	if ts.optCells != "" || ts.optTabletTypes != "" {
  1357  		query := fmt.Sprintf("update _vt.vreplication set cell = '%s', tablet_types = '%s' where workflow = '%s' and db_name = '%s'",
  1358  			ts.optCells, ts.optTabletTypes, ts.ReverseWorkflowName(), dbname)
  1359  		return query
  1360  	}
  1361  	return ""
  1362  }
  1363  
  1364  func (ts *trafficSwitcher) deleteReverseVReplication(ctx context.Context) error {
  1365  	return ts.ForAllSources(func(source *workflow.MigrationSource) error {
  1366  		query := fmt.Sprintf(sqlDeleteWorkflow, encodeString(source.GetPrimary().DbName()), encodeString(ts.reverseWorkflow))
  1367  		if _, err := ts.TabletManagerClient().VReplicationExec(ctx, source.GetPrimary().Tablet, query); err != nil {
  1368  			return err
  1369  		}
  1370  		ts.wr.deleteWorkflowVDiffData(ctx, source.GetPrimary().Tablet, ts.reverseWorkflow)
  1371  		ts.wr.optimizeCopyStateTable(source.GetPrimary().Tablet)
  1372  		return nil
  1373  	})
  1374  }
  1375  
  1376  func (ts *trafficSwitcher) createJournals(ctx context.Context, sourceWorkflows []string) error {
  1377  	log.Infof("In createJournals for source workflows %+v", sourceWorkflows)
  1378  	return ts.ForAllSources(func(source *workflow.MigrationSource) error {
  1379  		if source.Journaled {
  1380  			return nil
  1381  		}
  1382  		participants := make([]*binlogdatapb.KeyspaceShard, 0)
  1383  		participantMap := make(map[string]bool)
  1384  		journal := &binlogdatapb.Journal{
  1385  			Id:              ts.id,
  1386  			MigrationType:   ts.MigrationType(),
  1387  			Tables:          ts.Tables(),
  1388  			LocalPosition:   source.Position,
  1389  			Participants:    participants,
  1390  			SourceWorkflows: sourceWorkflows,
  1391  		}
  1392  		for targetShard, target := range ts.Targets() {
  1393  			for _, tsource := range target.Sources {
  1394  				participantMap[tsource.Shard] = true
  1395  			}
  1396  			journal.ShardGtids = append(journal.ShardGtids, &binlogdatapb.ShardGtid{
  1397  				Keyspace: ts.TargetKeyspaceName(),
  1398  				Shard:    targetShard,
  1399  				Gtid:     target.Position,
  1400  			})
  1401  		}
  1402  		shards := make([]string, 0)
  1403  		for shard := range participantMap {
  1404  			shards = append(shards, shard)
  1405  		}
  1406  		sort.Sort(vreplication.ShardSorter(shards))
  1407  		for _, shard := range shards {
  1408  			journal.Participants = append(journal.Participants, &binlogdatapb.KeyspaceShard{
  1409  				Keyspace: source.GetShard().Keyspace(),
  1410  				Shard:    shard,
  1411  			})
  1412  
  1413  		}
  1414  		log.Infof("Creating journal %v", journal)
  1415  		ts.Logger().Infof("Creating journal: %v", journal)
  1416  		statement := fmt.Sprintf("insert into _vt.resharding_journal "+
  1417  			"(id, db_name, val) "+
  1418  			"values (%v, %v, %v)",
  1419  			ts.id, encodeString(source.GetPrimary().DbName()), encodeString(journal.String()))
  1420  		if _, err := ts.TabletManagerClient().VReplicationExec(ctx, source.GetPrimary().Tablet, statement); err != nil {
  1421  			return err
  1422  		}
  1423  		return nil
  1424  	})
  1425  }
  1426  
  1427  func (ts *trafficSwitcher) allowTargetWrites(ctx context.Context) error {
  1428  	if ts.MigrationType() == binlogdatapb.MigrationType_TABLES {
  1429  		return ts.allowTableTargetWrites(ctx)
  1430  	}
  1431  	return ts.changeShardsAccess(ctx, ts.TargetKeyspaceName(), ts.TargetShards(), allowWrites)
  1432  }
  1433  
  1434  func (ts *trafficSwitcher) allowTableTargetWrites(ctx context.Context) error {
  1435  	return ts.ForAllTargets(func(target *workflow.MigrationTarget) error {
  1436  		if _, err := ts.TopoServer().UpdateShardFields(ctx, ts.TargetKeyspaceName(), target.GetShard().ShardName(), func(si *topo.ShardInfo) error {
  1437  			return si.UpdateSourceDeniedTables(ctx, topodatapb.TabletType_PRIMARY, nil, true, ts.Tables())
  1438  		}); err != nil {
  1439  			return err
  1440  		}
  1441  		rtbsCtx, cancel := context.WithTimeout(ctx, shardTabletRefreshTimeout)
  1442  		defer cancel()
  1443  		_, _, err := topotools.RefreshTabletsByShard(rtbsCtx, ts.TopoServer(), ts.TabletManagerClient(), target.GetShard(), nil, ts.Logger())
  1444  		return err
  1445  	})
  1446  }
  1447  
  1448  func (ts *trafficSwitcher) changeRouting(ctx context.Context) error {
  1449  	if ts.MigrationType() == binlogdatapb.MigrationType_TABLES {
  1450  		return ts.changeWriteRoute(ctx)
  1451  	}
  1452  	return ts.changeShardRouting(ctx)
  1453  }
  1454  
  1455  func (ts *trafficSwitcher) changeWriteRoute(ctx context.Context) error {
  1456  	if ts.isPartialMigration {
  1457  		srr, err := topotools.GetShardRoutingRules(ctx, ts.TopoServer())
  1458  		if err != nil {
  1459  			return err
  1460  		}
  1461  		for _, si := range ts.SourceShards() {
  1462  			delete(srr, fmt.Sprintf("%s.%s", ts.TargetKeyspaceName(), si.ShardName()))
  1463  			ts.Logger().Infof("Deleted shard routing: %v:%v", ts.TargetKeyspaceName(), si.ShardName())
  1464  			srr[fmt.Sprintf("%s.%s", ts.SourceKeyspaceName(), si.ShardName())] = ts.TargetKeyspaceName()
  1465  			ts.Logger().Infof("Added shard routing: %v:%v", ts.SourceKeyspaceName(), si.ShardName())
  1466  		}
  1467  		if err := topotools.SaveShardRoutingRules(ctx, ts.TopoServer(), srr); err != nil {
  1468  			return err
  1469  		}
  1470  	} else {
  1471  		rules, err := topotools.GetRoutingRules(ctx, ts.TopoServer())
  1472  		if err != nil {
  1473  			return err
  1474  		}
  1475  		for _, table := range ts.Tables() {
  1476  			targetKsTable := fmt.Sprintf("%s.%s", ts.TargetKeyspaceName(), table)
  1477  			sourceKsTable := fmt.Sprintf("%s.%s", ts.SourceKeyspaceName(), table)
  1478  			delete(rules, targetKsTable)
  1479  			ts.Logger().Infof("Deleted routing: %s", targetKsTable)
  1480  			rules[table] = []string{targetKsTable}
  1481  			rules[sourceKsTable] = []string{targetKsTable}
  1482  			ts.Logger().Infof("Added routing: %v %v", table, sourceKsTable)
  1483  		}
  1484  		if err := topotools.SaveRoutingRules(ctx, ts.TopoServer(), rules); err != nil {
  1485  			return err
  1486  		}
  1487  	}
  1488  
  1489  	return ts.TopoServer().RebuildSrvVSchema(ctx, nil)
  1490  }
  1491  
  1492  func (ts *trafficSwitcher) changeShardRouting(ctx context.Context) error {
  1493  	if err := ts.TopoServer().ValidateSrvKeyspace(ctx, ts.TargetKeyspaceName(), ""); err != nil {
  1494  		err2 := vterrors.Wrapf(err, "Before changing shard routes, found SrvKeyspace for %s is corrupt", ts.TargetKeyspaceName())
  1495  		log.Errorf("%w", err2)
  1496  		return err2
  1497  	}
  1498  	err := ts.ForAllSources(func(source *workflow.MigrationSource) error {
  1499  		_, err := ts.TopoServer().UpdateShardFields(ctx, ts.SourceKeyspaceName(), source.GetShard().ShardName(), func(si *topo.ShardInfo) error {
  1500  			si.IsPrimaryServing = false
  1501  			return nil
  1502  		})
  1503  		return err
  1504  	})
  1505  	if err != nil {
  1506  		return err
  1507  	}
  1508  	err = ts.ForAllTargets(func(target *workflow.MigrationTarget) error {
  1509  		_, err := ts.TopoServer().UpdateShardFields(ctx, ts.TargetKeyspaceName(), target.GetShard().ShardName(), func(si *topo.ShardInfo) error {
  1510  			si.IsPrimaryServing = true
  1511  			return nil
  1512  		})
  1513  		return err
  1514  	})
  1515  	if err != nil {
  1516  		return err
  1517  	}
  1518  	err = ts.TopoServer().MigrateServedType(ctx, ts.TargetKeyspaceName(), ts.TargetShards(), ts.SourceShards(), topodatapb.TabletType_PRIMARY, nil)
  1519  	if err != nil {
  1520  		return err
  1521  	}
  1522  	if err := ts.TopoServer().ValidateSrvKeyspace(ctx, ts.TargetKeyspaceName(), ""); err != nil {
  1523  		err2 := vterrors.Wrapf(err, "After changing shard routes, found SrvKeyspace for %s is corrupt", ts.TargetKeyspaceName())
  1524  		log.Errorf("%w", err2)
  1525  		return err2
  1526  	}
  1527  	return nil
  1528  }
  1529  
  1530  func (ts *trafficSwitcher) deleteShardRoutingRules(ctx context.Context) error {
  1531  	if !ts.isPartialMigration {
  1532  		return nil
  1533  	}
  1534  	srr, err := topotools.GetShardRoutingRules(ctx, ts.TopoServer())
  1535  	if err != nil {
  1536  		return err
  1537  	}
  1538  	for _, si := range ts.TargetShards() {
  1539  		delete(srr, fmt.Sprintf("%s.%s", ts.targetKeyspace, si.ShardName()))
  1540  	}
  1541  	if err := topotools.SaveShardRoutingRules(ctx, ts.TopoServer(), srr); err != nil {
  1542  		return err
  1543  	}
  1544  	return nil
  1545  }
  1546  
  1547  func (ts *trafficSwitcher) startReverseVReplication(ctx context.Context) error {
  1548  	return ts.ForAllSources(func(source *workflow.MigrationSource) error {
  1549  		query := fmt.Sprintf("update _vt.vreplication set state='Running', message='' where db_name=%s", encodeString(source.GetPrimary().DbName()))
  1550  		_, err := ts.VReplicationExec(ctx, source.GetPrimary().Alias, query)
  1551  		return err
  1552  	})
  1553  }
  1554  
  1555  func (ts *trafficSwitcher) changeShardsAccess(ctx context.Context, keyspace string, shards []*topo.ShardInfo, access accessType) error {
  1556  	if err := ts.TopoServer().UpdateDisableQueryService(ctx, keyspace, shards, topodatapb.TabletType_PRIMARY, nil, access == disallowWrites /* disable */); err != nil {
  1557  		return err
  1558  	}
  1559  	return ts.wr.refreshPrimaryTablets(ctx, shards)
  1560  }
  1561  
  1562  func (ts *trafficSwitcher) SourceShards() []*topo.ShardInfo {
  1563  	shards := make([]*topo.ShardInfo, 0, len(ts.Sources()))
  1564  	for _, source := range ts.Sources() {
  1565  		shards = append(shards, source.GetShard())
  1566  	}
  1567  	return shards
  1568  }
  1569  
  1570  func (ts *trafficSwitcher) TargetShards() []*topo.ShardInfo {
  1571  	shards := make([]*topo.ShardInfo, 0, len(ts.Targets()))
  1572  	for _, target := range ts.Targets() {
  1573  		shards = append(shards, target.GetShard())
  1574  	}
  1575  	return shards
  1576  }
  1577  
  1578  func (ts *trafficSwitcher) dropSourceDeniedTables(ctx context.Context) error {
  1579  	return ts.ForAllSources(func(source *workflow.MigrationSource) error {
  1580  		if _, err := ts.TopoServer().UpdateShardFields(ctx, ts.SourceKeyspaceName(), source.GetShard().ShardName(), func(si *topo.ShardInfo) error {
  1581  			return si.UpdateSourceDeniedTables(ctx, topodatapb.TabletType_PRIMARY, nil, true, ts.Tables())
  1582  		}); err != nil {
  1583  			return err
  1584  		}
  1585  		rtbsCtx, cancel := context.WithTimeout(ctx, shardTabletRefreshTimeout)
  1586  		defer cancel()
  1587  		_, _, err := topotools.RefreshTabletsByShard(rtbsCtx, ts.TopoServer(), ts.TabletManagerClient(), source.GetShard(), nil, ts.Logger())
  1588  		return err
  1589  	})
  1590  }
  1591  
  1592  func (ts *trafficSwitcher) validateWorkflowHasCompleted(ctx context.Context) error {
  1593  	return doValidateWorkflowHasCompleted(ctx, ts)
  1594  }
  1595  
  1596  func doValidateWorkflowHasCompleted(ctx context.Context, ts *trafficSwitcher) error {
  1597  	wg := sync.WaitGroup{}
  1598  	rec := concurrency.AllErrorRecorder{}
  1599  	if ts.MigrationType() == binlogdatapb.MigrationType_SHARDS {
  1600  		_ = ts.ForAllSources(func(source *workflow.MigrationSource) error {
  1601  			wg.Add(1)
  1602  			if source.GetShard().IsPrimaryServing {
  1603  				rec.RecordError(fmt.Errorf(fmt.Sprintf("Shard %s is still serving", source.GetShard().ShardName())))
  1604  			}
  1605  			wg.Done()
  1606  			return nil
  1607  		})
  1608  	} else {
  1609  		_ = ts.ForAllTargets(func(target *workflow.MigrationTarget) error {
  1610  			wg.Add(1)
  1611  			query := fmt.Sprintf("select 1 from _vt.vreplication where db_name='%s' and workflow='%s' and message!='FROZEN'", target.GetPrimary().DbName(), ts.WorkflowName())
  1612  			rs, _ := ts.VReplicationExec(ctx, target.GetPrimary().Alias, query)
  1613  			if len(rs.Rows) > 0 {
  1614  				rec.RecordError(fmt.Errorf("vreplication streams are not frozen on tablet %d", target.GetPrimary().Alias.Uid))
  1615  			}
  1616  			wg.Done()
  1617  			return nil
  1618  		})
  1619  	}
  1620  	wg.Wait()
  1621  
  1622  	if !ts.keepRoutingRules {
  1623  		//check if table is routable
  1624  		if ts.MigrationType() == binlogdatapb.MigrationType_TABLES {
  1625  			rules, err := topotools.GetRoutingRules(ctx, ts.TopoServer())
  1626  			if err != nil {
  1627  				rec.RecordError(fmt.Errorf("could not get RoutingRules"))
  1628  			}
  1629  			for fromTable, toTables := range rules {
  1630  				for _, toTable := range toTables {
  1631  					for _, table := range ts.Tables() {
  1632  						if toTable == fmt.Sprintf("%s.%s", ts.SourceKeyspaceName(), table) {
  1633  							rec.RecordError(fmt.Errorf("routing still exists from keyspace %s table %s to %s", ts.SourceKeyspaceName(), table, fromTable))
  1634  						}
  1635  					}
  1636  				}
  1637  			}
  1638  		}
  1639  	}
  1640  	if rec.HasErrors() {
  1641  		return fmt.Errorf("%s", strings.Join(rec.ErrorStrings(), "\n"))
  1642  	}
  1643  	return nil
  1644  
  1645  }
  1646  
  1647  func getRenameFileName(tableName string) string {
  1648  	return fmt.Sprintf(renameTableTemplate, tableName)
  1649  }
  1650  
  1651  func (ts *trafficSwitcher) removeSourceTables(ctx context.Context, removalType workflow.TableRemovalType) error {
  1652  	err := ts.ForAllSources(func(source *workflow.MigrationSource) error {
  1653  		for _, tableName := range ts.Tables() {
  1654  			query := fmt.Sprintf("drop table %s.%s",
  1655  				sqlescape.EscapeID(sqlescape.UnescapeID(source.GetPrimary().DbName())),
  1656  				sqlescape.EscapeID(sqlescape.UnescapeID(tableName)))
  1657  			if removalType == workflow.DropTable {
  1658  				ts.Logger().Infof("%s: Dropping table %s.%s\n",
  1659  					source.GetPrimary().String(), source.GetPrimary().DbName(), tableName)
  1660  			} else {
  1661  				renameName := getRenameFileName(tableName)
  1662  				ts.Logger().Infof("%s: Renaming table %s.%s to %s.%s\n",
  1663  					source.GetPrimary().String(), source.GetPrimary().DbName(), tableName, source.GetPrimary().DbName(), renameName)
  1664  				query = fmt.Sprintf("rename table %s.%s TO %s.%s",
  1665  					sqlescape.EscapeID(sqlescape.UnescapeID(source.GetPrimary().DbName())),
  1666  					sqlescape.EscapeID(sqlescape.UnescapeID(tableName)),
  1667  					sqlescape.EscapeID(sqlescape.UnescapeID(source.GetPrimary().DbName())),
  1668  					sqlescape.EscapeID(sqlescape.UnescapeID(renameName)))
  1669  			}
  1670  			_, err := ts.wr.ExecuteFetchAsDba(ctx, source.GetPrimary().Alias, query, 1, false, true)
  1671  			if err != nil {
  1672  				ts.Logger().Errorf("%s: Error removing table %s: %v", source.GetPrimary().String(), tableName, err)
  1673  				return err
  1674  			}
  1675  			ts.Logger().Infof("%s: Removed table %s.%s\n", source.GetPrimary().String(), source.GetPrimary().DbName(), tableName)
  1676  
  1677  		}
  1678  		return nil
  1679  	})
  1680  	if err != nil {
  1681  		return err
  1682  	}
  1683  
  1684  	return ts.dropParticipatingTablesFromKeyspace(ctx, ts.SourceKeyspaceName())
  1685  }
  1686  
  1687  func (ts *trafficSwitcher) dropParticipatingTablesFromKeyspace(ctx context.Context, keyspace string) error {
  1688  	vschema, err := ts.TopoServer().GetVSchema(ctx, keyspace)
  1689  	if err != nil {
  1690  		return err
  1691  	}
  1692  	for _, tableName := range ts.Tables() {
  1693  		delete(vschema.Tables, tableName)
  1694  	}
  1695  	return ts.TopoServer().SaveVSchema(ctx, keyspace, vschema)
  1696  }
  1697  
  1698  // FIXME: even after dropSourceShards there are still entries in the topo, need to research and fix
  1699  func (ts *trafficSwitcher) dropSourceShards(ctx context.Context) error {
  1700  	return ts.ForAllSources(func(source *workflow.MigrationSource) error {
  1701  		ts.Logger().Infof("Deleting shard %s.%s\n", source.GetShard().Keyspace(), source.GetShard().ShardName())
  1702  		err := ts.wr.DeleteShard(ctx, source.GetShard().Keyspace(), source.GetShard().ShardName(), true, false)
  1703  		if err != nil {
  1704  			ts.Logger().Errorf("Error deleting shard %s: %v", source.GetShard().ShardName(), err)
  1705  			return err
  1706  		}
  1707  		ts.Logger().Infof("Deleted shard %s.%s\n", source.GetShard().Keyspace(), source.GetShard().ShardName())
  1708  		return nil
  1709  	})
  1710  }
  1711  
  1712  func (ts *trafficSwitcher) freezeTargetVReplication(ctx context.Context) error {
  1713  	// Mark target streams as frozen before deleting. If SwitchWrites gets
  1714  	// re-invoked after a freeze, it will skip all the previous steps
  1715  	err := ts.ForAllTargets(func(target *workflow.MigrationTarget) error {
  1716  		ts.Logger().Infof("Marking target streams frozen for workflow %s db_name %s", ts.WorkflowName(), target.GetPrimary().DbName())
  1717  		query := fmt.Sprintf("update _vt.vreplication set message = '%s' where db_name=%s and workflow=%s", workflow.Frozen, encodeString(target.GetPrimary().DbName()), encodeString(ts.WorkflowName()))
  1718  		_, err := ts.TabletManagerClient().VReplicationExec(ctx, target.GetPrimary().Tablet, query)
  1719  		return err
  1720  	})
  1721  	if err != nil {
  1722  		return err
  1723  	}
  1724  	return nil
  1725  }
  1726  
  1727  func (ts *trafficSwitcher) dropTargetVReplicationStreams(ctx context.Context) error {
  1728  	return ts.ForAllTargets(func(target *workflow.MigrationTarget) error {
  1729  		ts.Logger().Infof("Deleting target streams and related data for workflow %s db_name %s", ts.WorkflowName(), target.GetPrimary().DbName())
  1730  		query := fmt.Sprintf(sqlDeleteWorkflow, encodeString(target.GetPrimary().DbName()), encodeString(ts.WorkflowName()))
  1731  		if _, err := ts.TabletManagerClient().VReplicationExec(ctx, target.GetPrimary().Tablet, query); err != nil {
  1732  			return err
  1733  		}
  1734  		ts.wr.deleteWorkflowVDiffData(ctx, target.GetPrimary().Tablet, ts.WorkflowName())
  1735  		ts.wr.optimizeCopyStateTable(target.GetPrimary().Tablet)
  1736  		return nil
  1737  	})
  1738  }
  1739  
  1740  func (ts *trafficSwitcher) dropSourceReverseVReplicationStreams(ctx context.Context) error {
  1741  	return ts.ForAllSources(func(source *workflow.MigrationSource) error {
  1742  		ts.Logger().Infof("Deleting reverse streams and related data for workflow %s db_name %s", ts.WorkflowName(), source.GetPrimary().DbName())
  1743  		query := fmt.Sprintf(sqlDeleteWorkflow, encodeString(source.GetPrimary().DbName()), encodeString(workflow.ReverseWorkflowName(ts.WorkflowName())))
  1744  		if _, err := ts.TabletManagerClient().VReplicationExec(ctx, source.GetPrimary().Tablet, query); err != nil {
  1745  			return err
  1746  		}
  1747  		ts.wr.deleteWorkflowVDiffData(ctx, source.GetPrimary().Tablet, workflow.ReverseWorkflowName(ts.WorkflowName()))
  1748  		ts.wr.optimizeCopyStateTable(source.GetPrimary().Tablet)
  1749  		return nil
  1750  	})
  1751  }
  1752  
  1753  func (ts *trafficSwitcher) removeTargetTables(ctx context.Context) error {
  1754  	log.Infof("removeTargetTables")
  1755  	err := ts.ForAllTargets(func(target *workflow.MigrationTarget) error {
  1756  		for _, tableName := range ts.Tables() {
  1757  			query := fmt.Sprintf("drop table %s.%s",
  1758  				sqlescape.EscapeID(sqlescape.UnescapeID(target.GetPrimary().DbName())),
  1759  				sqlescape.EscapeID(sqlescape.UnescapeID(tableName)))
  1760  			ts.Logger().Infof("%s: Dropping table %s.%s\n",
  1761  				target.GetPrimary().String(), target.GetPrimary().DbName(), tableName)
  1762  			_, err := ts.wr.ExecuteFetchAsDba(ctx, target.GetPrimary().Alias, query, 1, false, true)
  1763  			if err != nil {
  1764  				ts.Logger().Errorf("%s: Error removing table %s: %v",
  1765  					target.GetPrimary().String(), tableName, err)
  1766  				return err
  1767  			}
  1768  			ts.Logger().Infof("%s: Removed table %s.%s\n",
  1769  				target.GetPrimary().String(), target.GetPrimary().DbName(), tableName)
  1770  
  1771  		}
  1772  		return nil
  1773  	})
  1774  	if err != nil {
  1775  		return err
  1776  	}
  1777  
  1778  	return ts.dropParticipatingTablesFromKeyspace(ctx, ts.TargetKeyspaceName())
  1779  
  1780  }
  1781  
  1782  func (ts *trafficSwitcher) dropTargetShards(ctx context.Context) error {
  1783  	return ts.ForAllTargets(func(target *workflow.MigrationTarget) error {
  1784  		ts.Logger().Infof("Deleting shard %s.%s\n", target.GetShard().Keyspace(), target.GetShard().ShardName())
  1785  		err := ts.wr.DeleteShard(ctx, target.GetShard().Keyspace(), target.GetShard().ShardName(), true, false)
  1786  		if err != nil {
  1787  			ts.Logger().Errorf("Error deleting shard %s: %v", target.GetShard().ShardName(), err)
  1788  			return err
  1789  		}
  1790  		ts.Logger().Infof("Deleted shard %s.%s\n", target.GetShard().Keyspace(), target.GetShard().ShardName())
  1791  		return nil
  1792  	})
  1793  }
  1794  
  1795  func (ts *trafficSwitcher) deleteRoutingRules(ctx context.Context) error {
  1796  	rules, err := topotools.GetRoutingRules(ctx, ts.TopoServer())
  1797  	if err != nil {
  1798  		return err
  1799  	}
  1800  	for _, table := range ts.Tables() {
  1801  		delete(rules, table)
  1802  		delete(rules, table+"@replica")
  1803  		delete(rules, table+"@rdonly")
  1804  		delete(rules, ts.TargetKeyspaceName()+"."+table)
  1805  		delete(rules, ts.TargetKeyspaceName()+"."+table+"@replica")
  1806  		delete(rules, ts.TargetKeyspaceName()+"."+table+"@rdonly")
  1807  		delete(rules, ts.SourceKeyspaceName()+"."+table)
  1808  		delete(rules, ts.SourceKeyspaceName()+"."+table+"@replica")
  1809  		delete(rules, ts.SourceKeyspaceName()+"."+table+"@rdonly")
  1810  	}
  1811  	if err := topotools.SaveRoutingRules(ctx, ts.TopoServer(), rules); err != nil {
  1812  		return err
  1813  	}
  1814  	return nil
  1815  }
  1816  
  1817  // addParticipatingTablesToKeyspace updates the vschema with the new tables that were created as part of the
  1818  // Migrate flow. It is called when the Migrate flow is Completed
  1819  func (ts *trafficSwitcher) addParticipatingTablesToKeyspace(ctx context.Context, keyspace, tableSpecs string) error {
  1820  	vschema, err := ts.TopoServer().GetVSchema(ctx, keyspace)
  1821  	if err != nil {
  1822  		return err
  1823  	}
  1824  	if vschema == nil {
  1825  		return fmt.Errorf("no vschema found for keyspace %s", keyspace)
  1826  	}
  1827  	if vschema.Tables == nil {
  1828  		vschema.Tables = make(map[string]*vschemapb.Table)
  1829  	}
  1830  	if strings.HasPrefix(tableSpecs, "{") { // user defined the vschema snippet, typically for a sharded target
  1831  		wrap := fmt.Sprintf(`{"tables": %s}`, tableSpecs)
  1832  		ks := &vschemapb.Keyspace{}
  1833  		if err := json2.Unmarshal([]byte(wrap), ks); err != nil {
  1834  			return err
  1835  		}
  1836  		if err != nil {
  1837  			return err
  1838  		}
  1839  		for table, vtab := range ks.Tables {
  1840  			vschema.Tables[table] = vtab
  1841  		}
  1842  	} else {
  1843  		if vschema.Sharded {
  1844  			return fmt.Errorf("no sharded vschema was provided, so you will need to update the vschema of the target manually for the moved tables")
  1845  		}
  1846  		for _, table := range ts.tables {
  1847  			vschema.Tables[table] = &vschemapb.Table{}
  1848  		}
  1849  	}
  1850  	return ts.TopoServer().SaveVSchema(ctx, keyspace, vschema)
  1851  }