github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/dm/master/shardddl/optimist.go (about)

     1  // Copyright 2020 PingCAP, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // See the License for the specific language governing permissions and
    12  // limitations under the License.
    13  
    14  package shardddl
    15  
    16  import (
    17  	"context"
    18  	"fmt"
    19  	"sort"
    20  	"sync"
    21  	"time"
    22  
    23  	"github.com/pingcap/failpoint"
    24  	"github.com/pingcap/tidb/pkg/util/dbutil"
    25  	"github.com/pingcap/tiflow/dm/common"
    26  	"github.com/pingcap/tiflow/dm/config"
    27  	"github.com/pingcap/tiflow/dm/config/dbconfig"
    28  	"github.com/pingcap/tiflow/dm/master/metrics"
    29  	"github.com/pingcap/tiflow/dm/pb"
    30  	"github.com/pingcap/tiflow/dm/pkg/etcdutil"
    31  	"github.com/pingcap/tiflow/dm/pkg/log"
    32  	"github.com/pingcap/tiflow/dm/pkg/shardddl/optimism"
    33  	"github.com/pingcap/tiflow/dm/pkg/terror"
    34  	"github.com/pingcap/tiflow/dm/pkg/utils"
    35  	clientv3 "go.etcd.io/etcd/client/v3"
    36  	"go.uber.org/zap"
    37  )
    38  
// Optimist is used to coordinate the shard DDL migration in optimism mode.
type Optimist struct {
	// mu is held by Start, Close, the manual unlock/remove-meta entry points
	// and the background watch handlers while they touch the state below.
	mu sync.Mutex

	logger log.Logger

	// closed is true for a freshly created instance, false after a successful
	// Start and true again after Close. cancel stops the background goroutine
	// launched by Start, and wg lets Close wait for that goroutine to exit.
	closed bool
	cancel context.CancelFunc
	wg     sync.WaitGroup

	// cli is the etcd client passed to Start; lk and tk keep the in-memory
	// shard DDL locks and the source/target tables respectively.
	cli *clientv3.Client
	lk  *optimism.LockKeeper
	tk  *optimism.TableKeeper
}
    53  
    54  // NewOptimist creates a new Optimist instance.
    55  func NewOptimist(pLogger *log.Logger, getDownstreamMetaFunc func(string) (*dbconfig.DBConfig, string)) *Optimist {
    56  	return &Optimist{
    57  		logger: pLogger.WithFields(zap.String("component", "shard DDL optimist")),
    58  		closed: true,
    59  		lk:     optimism.NewLockKeeper(getDownstreamMetaFunc),
    60  		tk:     optimism.NewTableKeeper(),
    61  	}
    62  }
    63  
    64  // Start starts the shard DDL coordination in optimism mode.
    65  // NOTE: for logic errors, it should start without returning errors (but report via metrics or log) so that the user can fix them.
    66  func (o *Optimist) Start(pCtx context.Context, etcdCli *clientv3.Client) error {
    67  	o.logger.Info("the shard DDL optimist is starting")
    68  
    69  	o.mu.Lock()
    70  	defer o.mu.Unlock()
    71  
    72  	o.cli = etcdCli // o.cli should be set before watching and recover locks because these operations need o.cli
    73  
    74  	revSource, revInfo, revOperation, err := o.rebuildLocks()
    75  	if err != nil {
    76  		return err
    77  	}
    78  
    79  	ctx, cancel := context.WithCancel(pCtx)
    80  
    81  	o.wg.Add(1)
    82  	go func() {
    83  		defer o.wg.Done()
    84  		// TODO: handle fatal error from run
    85  		//nolint:errcheck
    86  		o.run(ctx, revSource, revInfo, revOperation)
    87  	}()
    88  
    89  	o.closed = false // started now, no error will interrupt the start process.
    90  	o.cancel = cancel
    91  	o.logger.Info("the shard DDL optimist has started")
    92  	return nil
    93  }
    94  
    95  // Close closes the Optimist instance.
    96  func (o *Optimist) Close() {
    97  	o.mu.Lock()
    98  	if o.closed {
    99  		o.mu.Unlock()
   100  		return
   101  	}
   102  
   103  	if o.cancel != nil {
   104  		o.cancel()
   105  		o.cancel = nil
   106  	}
   107  
   108  	o.closed = true // closed now.
   109  	o.mu.Unlock()
   110  	// unlock before wg.Wait() to avoid deadlock because other goroutines acquire the lock.
   111  	// such as https://github.com/pingcap/tiflow/blob/92fc4c4/dm/dm/master/shardddl/optimist.go#L686
   112  	o.wg.Wait()
   113  	o.logger.Info("the shard DDL optimist has closed")
   114  }
   115  
   116  // Locks return all shard DDL locks current exist.
   117  func (o *Optimist) Locks() map[string]*optimism.Lock {
   118  	return o.lk.Locks()
   119  }
   120  
   121  // ShowLocks is used by `show-ddl-locks` command.
   122  func (o *Optimist) ShowLocks(task string, sources []string) ([]*pb.DDLLock, error) {
   123  	locks := o.lk.Locks()
   124  	ret := make([]*pb.DDLLock, 0, len(locks))
   125  	var ifm map[string]map[string]map[string]map[string]optimism.Info
   126  	opm, _, err := optimism.GetAllOperations(o.cli)
   127  	if err == nil {
   128  		ifm, _, err = optimism.GetAllInfo(o.cli)
   129  	}
   130  	for _, lock := range locks {
   131  		if task != "" && task != lock.Task {
   132  			continue // specify task but mismatch
   133  		}
   134  		ready := lock.Ready()
   135  		if len(sources) > 0 {
   136  			for _, source := range sources {
   137  				if _, ok := ready[source]; ok {
   138  					goto FOUND // if any source matched, show lock for it.
   139  				}
   140  			}
   141  			continue // specify sources but mismath
   142  		}
   143  	FOUND:
   144  		var (
   145  			owners    []string
   146  			ddlGroups [][]string
   147  		)
   148  
   149  		appendOwnerDDLs := func(opmss map[string]map[string]optimism.Operation, source string) {
   150  			for schema, opmsst := range opmss {
   151  				for table, op := range opmsst {
   152  					if op.ConflictStage != optimism.ConflictDetected {
   153  						continue
   154  					}
   155  					if _, ok := ifm[lock.Task]; !ok {
   156  						continue
   157  					}
   158  					if _, ok := ifm[lock.Task][source]; !ok {
   159  						continue
   160  					}
   161  					if _, ok := ifm[lock.Task][source][schema]; !ok {
   162  						continue
   163  					}
   164  					if info, ok := ifm[lock.Task][source][schema][table]; ok {
   165  						owners = append(owners, utils.GenDDLLockID(source, schema, table))
   166  						ddlGroups = append(ddlGroups, info.DDLs)
   167  					}
   168  				}
   169  			}
   170  		}
   171  		if opms, ok := opm[lock.Task]; ok {
   172  			if len(sources) > 0 {
   173  				for _, source := range sources {
   174  					if opmss, ok := opms[source]; ok {
   175  						appendOwnerDDLs(opmss, source)
   176  					}
   177  				}
   178  			} else {
   179  				for source, opmss := range opms {
   180  					appendOwnerDDLs(opmss, source)
   181  				}
   182  			}
   183  		}
   184  		lockSynced := make([]string, 0, len(ready))
   185  		lockUnsynced := make([]string, 0, len(ready))
   186  		for source, schemaTables := range ready {
   187  			for schema, tables := range schemaTables {
   188  				for table, synced := range tables {
   189  					if synced {
   190  						lockSynced = append(lockSynced, fmt.Sprintf("%s-%s", source, dbutil.TableName(schema, table)))
   191  					} else {
   192  						lockUnsynced = append(lockUnsynced, fmt.Sprintf("%s-%s", source, dbutil.TableName(schema, table)))
   193  					}
   194  				}
   195  			}
   196  		}
   197  		sort.Strings(lockSynced)
   198  		sort.Strings(lockUnsynced)
   199  
   200  		if len(owners) == 0 {
   201  			owners = append(owners, "")
   202  			ddlGroups = append(ddlGroups, nil)
   203  		}
   204  		for i, owner := range owners {
   205  			ret = append(ret, &pb.DDLLock{
   206  				ID:       lock.ID,
   207  				Task:     lock.Task,
   208  				Mode:     config.ShardOptimistic,
   209  				Owner:    owner,
   210  				DDLs:     ddlGroups[i],
   211  				Synced:   lockSynced,
   212  				Unsynced: lockUnsynced,
   213  			})
   214  		}
   215  	}
   216  	return ret, err
   217  }
   218  
   219  // UnlockLock unlocks a shard DDL lock manually only when using `unlock-ddl-lock` command.
   220  // ID: the shard DDL lock ID.
   221  // source, upstreamSchema, upstreamTable: reveal the upstream table's info which we need to skip/exec
   222  // action: whether to skip/exec the blocking DDLs for the specified upstream table
   223  // NOTE: this function has side effects, if it failed, some status can't revert anymore.
   224  // NOTE: this function should not be called if the lock is still in automatic resolving.
   225  func (o *Optimist) UnlockLock(ctx context.Context, id, source, upstreamSchema, upstreamTable string, action pb.UnlockDDLLockOp) error {
   226  	o.mu.Lock()
   227  	defer o.mu.Unlock()
   228  	if o.closed {
   229  		return terror.ErrMasterOptimistNotStarted.Generate()
   230  	}
   231  	task := utils.ExtractTaskFromLockID(id)
   232  	// 1. find the lock.
   233  	lock := o.lk.FindLock(id)
   234  	if lock == nil {
   235  		return terror.ErrMasterLockNotFound.Generate(id)
   236  	}
   237  
   238  	// 2. check whether has resolved before (this often should not happen).
   239  	if lock.IsResolved() {
   240  		_, err := o.removeLock(lock)
   241  		return err
   242  	}
   243  
   244  	// 3. find out related info & operation
   245  	infos, ops, _, err := optimism.GetInfosOperationsByTask(o.cli, task)
   246  	if err != nil {
   247  		return terror.ErrMasterLockIsResolving.Generatef("fail to get info and operation for task %s", task)
   248  	}
   249  	l := 0
   250  	for i, info := range infos {
   251  		if info.Task == task && info.Source == source && info.UpSchema == upstreamSchema && info.UpTable == upstreamTable {
   252  			infos[l] = infos[i]
   253  			l++
   254  		}
   255  	}
   256  	// TODO: change this condition after unlock ddl supports unlock several tables at one time
   257  	if l != 1 {
   258  		return terror.ErrMasterLockIsResolving.Generatef("fail to find related info for lock %s", id)
   259  	}
   260  	infos = infos[:l]
   261  
   262  	l = 0
   263  	for j, op := range ops {
   264  		if op.Task == task && op.Source == source && op.UpSchema == upstreamSchema && op.UpTable == upstreamTable {
   265  			// TODO: adjust waiting for redirect conflict status
   266  			if op.ConflictStage != optimism.ConflictDetected {
   267  				return terror.ErrMasterLockIsResolving.Generatef("lock %s is in %s status, not conflicted", id, op.ConflictStage)
   268  			}
   269  			ops[l] = ops[j]
   270  			l++
   271  		}
   272  	}
   273  	// TODO: change this condition after unlock ddl supports unlock several tables at one time
   274  	if l != 1 {
   275  		return terror.ErrMasterLockIsResolving.Generatef("fail to find related operation for lock %s", id)
   276  	}
   277  	ops = ops[:l]
   278  
   279  	// 4. rewrite operation.DDLs to skip/exec DDLs
   280  	switch action {
   281  	case pb.UnlockDDLLockOp_ExecLock:
   282  		ops[0].DDLs = infos[0].DDLs
   283  	case pb.UnlockDDLLockOp_SkipLock:
   284  		ops[0].DDLs = ops[0].DDLs[:0]
   285  	}
   286  	ops[0].ConflictStage = optimism.ConflictUnlocked
   287  
   288  	// 5. put operation into etcd for workers to execute
   289  	rev, succ, err := optimism.PutOperation(o.cli, false, ops[0], ops[0].Revision+1)
   290  	if err != nil {
   291  		return err
   292  	}
   293  	if action == pb.UnlockDDLLockOp_ExecLock {
   294  		lock.UpdateTableAfterUnlock(infos[0])
   295  	}
   296  	o.logger.Info("put shard DDL lock operation", zap.String("lock", id),
   297  		zap.Stringer("operation", ops[0]), zap.Bool("already exist", !succ), zap.Int64("revision", rev))
   298  	return nil
   299  }
   300  
   301  // RemoveMetaDataWithTask removes meta data for a specified task
   302  // NOTE: this function can only be used when the specified task is not running.
   303  // This function only be used when --remove-meta or stop-task
   304  // NOTE: For stop-task, we still delete drop columns in etcd though user may restart the task again later.
   305  func (o *Optimist) RemoveMetaDataWithTask(task string) error {
   306  	o.mu.Lock()
   307  	defer o.mu.Unlock()
   308  	if o.closed {
   309  		return terror.ErrMasterOptimistNotStarted.Generate()
   310  	}
   311  
   312  	lockIDSet := make(map[string]struct{})
   313  
   314  	infos, ops, _, err := optimism.GetInfosOperationsByTask(o.cli, task)
   315  	if err != nil {
   316  		return err
   317  	}
   318  	for _, info := range infos {
   319  		o.lk.RemoveLockByInfo(info)
   320  		lockIDSet[utils.GenDDLLockID(info.Task, info.DownSchema, info.DownTable)] = struct{}{}
   321  	}
   322  	for _, op := range ops {
   323  		o.lk.RemoveLock(op.ID)
   324  	}
   325  
   326  	o.lk.RemoveDownstreamMeta(task)
   327  	o.tk.RemoveTableByTask(task)
   328  
   329  	// clear meta data in etcd
   330  	_, err = optimism.DeleteInfosOperationsTablesByTask(o.cli, task, lockIDSet)
   331  	return err
   332  }
   333  
   334  // RemoveMetaDataWithTaskAndSources removes meta data for a specified task and sources
   335  // NOTE: this function can only be used when the specified task for source is not running.
   336  func (o *Optimist) RemoveMetaDataWithTaskAndSources(task string, sources ...string) error {
   337  	o.mu.Lock()
   338  	defer o.mu.Unlock()
   339  	if o.closed {
   340  		return terror.ErrMasterOptimistNotStarted.Generate()
   341  	}
   342  
   343  	dropColumns := make(map[string][]string)
   344  
   345  	// gets all locks for this task
   346  	locks := o.lk.FindLocksByTask(task)
   347  	for _, lock := range locks {
   348  		// remove table by sources for related lock
   349  		cols := lock.TryRemoveTableBySources(sources)
   350  		dropColumns[lock.ID] = cols
   351  		o.logger.Debug("the tables removed from the lock", zap.String("task", task), zap.Strings("sources", sources))
   352  		if !lock.HasTables() {
   353  			o.lk.RemoveLock(lock.ID)
   354  		}
   355  	}
   356  
   357  	o.lk.RemoveDownstreamMeta(task)
   358  	// remove source table in table keeper
   359  	o.tk.RemoveTableByTaskAndSources(task, sources)
   360  	o.logger.Debug("the tables removed from the table keeper", zap.String("task", task), zap.Strings("source", sources))
   361  	// clear meta data in etcd
   362  	_, err := optimism.DeleteInfosOperationsTablesByTaskAndSource(o.cli, task, sources, dropColumns)
   363  	return err
   364  }
   365  
   366  // run runs jobs in the background.
   367  func (o *Optimist) run(ctx context.Context, revSource, revInfo, revOperation int64) error {
   368  	for {
   369  		err := o.watchSourceInfoOperation(ctx, revSource, revInfo, revOperation)
   370  		if etcdutil.IsRetryableError(err) {
   371  			retryNum := 0
   372  			for {
   373  				retryNum++
   374  				select {
   375  				case <-ctx.Done():
   376  					return nil
   377  				case <-time.After(500 * time.Millisecond):
   378  					revSource, revInfo, revOperation, err = o.rebuildLocks()
   379  					if err != nil {
   380  						o.logger.Error("fail to rebuild shard DDL lock, will retry",
   381  							zap.Int("retryNum", retryNum), zap.Error(err))
   382  						continue
   383  					}
   384  				}
   385  				break
   386  			}
   387  		} else {
   388  			if err != nil {
   389  				o.logger.Error("non-retryable error occurred, optimist will quite now", zap.Error(err))
   390  			}
   391  			return err
   392  		}
   393  	}
   394  }
   395  
   396  // rebuildLocks rebuilds shard DDL locks from etcd persistent data.
   397  func (o *Optimist) rebuildLocks() (revSource, revInfo, revOperation int64, err error) {
   398  	o.lk.Clear() // clear all previous locks to support re-Start.
   399  
   400  	// get the history & initial source tables.
   401  	stm, revSource, err := optimism.GetAllSourceTables(o.cli)
   402  	if err != nil {
   403  		return 0, 0, 0, err
   404  	}
   405  	// we do not log `stm`, `ifm` and `opm` now, because they may too long in optimism mode.
   406  	o.logger.Info("get history initial source tables", zap.Int64("revision", revSource))
   407  	o.tk.Init(stm) // re-initialize again with valid tables.
   408  
   409  	// get the history shard DDL info.
   410  	ifm, revInfo, err := optimism.GetAllInfo(o.cli)
   411  	if err != nil {
   412  		return 0, 0, 0, err
   413  	}
   414  	o.logger.Info("get history shard DDL info", zap.Int64("revision", revInfo))
   415  
   416  	// get the history shard DDL lock operation.
   417  	// the newly operations after this GET will be received through the WATCH with `revOperation+1`,
   418  	opm, revOperation, err := optimism.GetAllOperations(o.cli)
   419  	if err != nil {
   420  		return 0, 0, 0, err
   421  	}
   422  	o.logger.Info("get history shard DDL lock operation", zap.Int64("revision", revOperation))
   423  
   424  	colm, _, err := optimism.GetAllDroppedColumns(o.cli)
   425  	if err != nil {
   426  		// only log the error, and don't return it to forbid the startup of the DM-master leader.
   427  		// then these unexpected columns can be handled by the user.
   428  		o.logger.Error("fail to recover colms", log.ShortError(err))
   429  	}
   430  	o.lk.SetDropColumns(colm)
   431  
   432  	// recover the shard DDL lock based on history shard DDL info & lock operation.
   433  	err = o.recoverLocks(ifm, opm)
   434  	if err != nil {
   435  		// only log the error, and don't return it to forbid the startup of the DM-master leader.
   436  		// then these unexpected locks can be handled by the user.
   437  		o.logger.Error("fail to recover locks", log.ShortError(err))
   438  	}
   439  	o.lk.SetDropColumns(nil)
   440  
   441  	return revSource, revInfo, revOperation, nil
   442  }
   443  
   444  // sortInfos sort all infos by revision.
   445  func sortInfos(ifm map[string]map[string]map[string]map[string]optimism.Info) []optimism.Info {
   446  	infos := make([]optimism.Info, 0, len(ifm))
   447  
   448  	for _, ifTask := range ifm {
   449  		for _, ifSource := range ifTask {
   450  			for _, ifSchema := range ifSource {
   451  				for _, info := range ifSchema {
   452  					infos = append(infos, info)
   453  				}
   454  			}
   455  		}
   456  	}
   457  
   458  	// sort according to the Revision
   459  	sort.Slice(infos, func(i, j int) bool {
   460  		return infos[i].Revision < infos[j].Revision
   461  	})
   462  	return infos
   463  }
   464  
   465  // recoverLocks recovers shard DDL locks based on shard DDL info and shard DDL lock operation.
   466  func (o *Optimist) recoverLocks(
   467  	ifm map[string]map[string]map[string]map[string]optimism.Info,
   468  	opm map[string]map[string]map[string]map[string]optimism.Operation,
   469  ) error {
   470  	// sort infos by revision
   471  	infos := sortInfos(ifm)
   472  	var firstErr error
   473  	setFirstErr := func(err error) {
   474  		if firstErr == nil && err != nil {
   475  			firstErr = err
   476  		}
   477  	}
   478  
   479  	for _, info := range infos {
   480  		if info.IsDeleted {
   481  			// TODO: handle drop table
   482  			continue
   483  		}
   484  		if !o.tk.SourceTableExist(info.Task, info.Source, info.UpSchema, info.UpTable, info.DownSchema, info.DownTable) {
   485  			continue
   486  		}
   487  		// never mark the lock operation from `done` to `not-done` when recovering.
   488  		err := o.handleInfo(info, true)
   489  		if err != nil {
   490  			o.logger.Error("fail to handle info while recovering locks", zap.Error(err))
   491  			setFirstErr(err)
   492  		}
   493  	}
   494  
   495  	// update the done status of the lock.
   496  	for _, opTask := range opm {
   497  		for _, opSource := range opTask {
   498  			for _, opSchema := range opSource {
   499  				for _, op := range opSchema {
   500  					lock := o.lk.FindLock(op.ID)
   501  					if lock == nil {
   502  						o.logger.Warn("lock for the operation not found", zap.Stringer("operation", op))
   503  						continue
   504  					}
   505  					if op.Done {
   506  						lock.TryMarkDone(op.Source, op.UpSchema, op.UpTable)
   507  						err := lock.DeleteColumnsByOp(op)
   508  						if err != nil {
   509  							o.logger.Error("fail to update lock columns", zap.Error(err))
   510  						}
   511  						// should remove resolved lock or it will be kept until next DDL
   512  						if lock.IsResolved() {
   513  							o.removeLockOptional(op, lock)
   514  						}
   515  					}
   516  				}
   517  			}
   518  		}
   519  	}
   520  	return firstErr
   521  }
   522  
   523  // watchSourceInfoOperation watches the etcd operation for source tables, shard DDL infos and shard DDL operations.
   524  func (o *Optimist) watchSourceInfoOperation(
   525  	pCtx context.Context, revSource, revInfo, revOperation int64,
   526  ) error {
   527  	ctx, cancel := context.WithCancel(pCtx)
   528  	var wg sync.WaitGroup
   529  	defer func() {
   530  		cancel()
   531  		wg.Wait()
   532  	}()
   533  
   534  	errCh := make(chan error, 10)
   535  
   536  	// watch for source tables and handle them.
   537  	sourceCh := make(chan optimism.SourceTables, 10)
   538  	wg.Add(2)
   539  	go func() {
   540  		defer func() {
   541  			wg.Done()
   542  			close(sourceCh)
   543  		}()
   544  		optimism.WatchSourceTables(ctx, o.cli, revSource+1, sourceCh, errCh)
   545  	}()
   546  	go func() {
   547  		defer wg.Done()
   548  		o.handleSourceTables(ctx, sourceCh)
   549  	}()
   550  
   551  	// watch for the shard DDL info and handle them.
   552  	infoCh := make(chan optimism.Info, 10)
   553  	wg.Add(2)
   554  	go func() {
   555  		defer func() {
   556  			wg.Done()
   557  			close(infoCh)
   558  		}()
   559  		optimism.WatchInfo(ctx, o.cli, revInfo+1, infoCh, errCh)
   560  	}()
   561  	go func() {
   562  		defer wg.Done()
   563  		o.handleInfoPut(ctx, infoCh)
   564  	}()
   565  
   566  	// watch for the shard DDL lock operation and handle them.
   567  	opCh := make(chan optimism.Operation, 10)
   568  	wg.Add(2)
   569  	go func() {
   570  		defer func() {
   571  			wg.Done()
   572  			close(opCh)
   573  		}()
   574  		optimism.WatchOperationPut(ctx, o.cli, "", "", "", "", revOperation+1, opCh, errCh)
   575  	}()
   576  	go func() {
   577  		defer wg.Done()
   578  		o.handleOperationPut(ctx, opCh)
   579  	}()
   580  
   581  	select {
   582  	case err := <-errCh:
   583  		return err
   584  	case <-pCtx.Done():
   585  		return nil
   586  	}
   587  }
   588  
   589  // handleSourceTables handles PUT and DELETE for source tables.
   590  func (o *Optimist) handleSourceTables(ctx context.Context, sourceCh <-chan optimism.SourceTables) {
   591  	for {
   592  		select {
   593  		case <-ctx.Done():
   594  			return
   595  		case st, ok := <-sourceCh:
   596  			if !ok {
   597  				return
   598  			}
   599  			o.mu.Lock()
   600  			addedTable, droppedTable := o.tk.Update(st)
   601  			// handle create table
   602  			for routeTable := range addedTable {
   603  				lock := o.lk.FindLock(utils.GenDDLLockID(st.Task, routeTable.DownSchema, routeTable.DownTable))
   604  				if lock != nil {
   605  					lock.AddTable(st.Source, routeTable.UpSchema, routeTable.UpTable, true)
   606  				}
   607  			}
   608  			// handle drop table
   609  			for routeTable := range droppedTable {
   610  				lock := o.lk.FindLock(utils.GenDDLLockID(st.Task, routeTable.DownSchema, routeTable.DownTable))
   611  				if lock != nil {
   612  					cols := lock.TryRemoveTable(st.Source, routeTable.UpSchema, routeTable.UpTable)
   613  					if !lock.HasTables() {
   614  						o.lk.RemoveLock(lock.ID)
   615  					}
   616  					_, err := optimism.DeleteInfosOperationsTablesByTable(o.cli, st.Task, st.Source, routeTable.UpSchema, routeTable.UpTable, lock.ID, cols)
   617  					if err != nil {
   618  						o.logger.Error("failed to delete etcd meta data for table", zap.String("lockID", lock.ID), zap.String("schema", routeTable.UpSchema), zap.String("table", routeTable.UpTable))
   619  					}
   620  				}
   621  			}
   622  			o.mu.Unlock()
   623  		}
   624  	}
   625  }
   626  
   627  // handleInfoPut handles PUT and DELETE for the shard DDL info.
   628  func (o *Optimist) handleInfoPut(ctx context.Context, infoCh <-chan optimism.Info) {
   629  	for {
   630  		select {
   631  		case <-ctx.Done():
   632  			return
   633  		case info, ok := <-infoCh:
   634  			if !ok {
   635  				return
   636  			}
   637  			o.logger.Info("receive a shard DDL info", zap.Stringer("info", info), zap.Bool("is deleted", info.IsDeleted))
   638  
   639  			if info.IsDeleted {
   640  				// this often happen after the lock resolved.
   641  				continue
   642  			}
   643  
   644  			// avoid new ddl added while previous ddl resolved and remove lock
   645  			// change lock granularity if needed
   646  			o.mu.Lock()
   647  			// put operation for the table. we don't set `skipDone=true` now,
   648  			// because in optimism mode, one table may execute/done multiple DDLs but other tables may do nothing.
   649  			_ = o.handleInfo(info, false)
   650  			o.mu.Unlock()
   651  		}
   652  	}
   653  }
   654  
   655  func (o *Optimist) handleInfo(info optimism.Info, skipDone bool) error {
   656  	added := o.tk.AddTable(info.Task, info.Source, info.UpSchema, info.UpTable, info.DownSchema, info.DownTable)
   657  	o.logger.Debug("a table added for info", zap.Bool("added", added), zap.String("info", info.ShortString()))
   658  
   659  	tts := o.tk.FindTables(info.Task, info.DownSchema, info.DownTable)
   660  	if tts == nil {
   661  		// WATCH for SourceTables may fall behind WATCH for Info although PUT earlier,
   662  		// so we try to get SourceTables again.
   663  		// NOTE: check SourceTables for `info.Source` if needed later.
   664  		stm, _, err := optimism.GetAllSourceTables(o.cli)
   665  		if err != nil {
   666  			o.logger.Error("fail to get source tables", log.ShortError(err))
   667  		} else if tts2 := optimism.TargetTablesForTask(info.Task, info.DownSchema, info.DownTable, stm); tts2 != nil {
   668  			tts = tts2
   669  		}
   670  	}
   671  	err := o.handleLock(info, tts, skipDone)
   672  	if err != nil {
   673  		o.logger.Error("fail to handle the shard DDL lock", zap.String("info", info.ShortString()), log.ShortError(err))
   674  		metrics.ReportDDLError(info.Task, metrics.InfoErrHandleLock)
   675  	}
   676  	return err
   677  }
   678  
   679  // handleOperationPut handles PUT for the shard DDL lock operations.
   680  func (o *Optimist) handleOperationPut(ctx context.Context, opCh <-chan optimism.Operation) {
   681  	for {
   682  		select {
   683  		case <-ctx.Done():
   684  			return
   685  		case op, ok := <-opCh:
   686  			if !ok {
   687  				return
   688  			}
   689  			o.logger.Info("receive a shard DDL lock operation", zap.Stringer("operation", op))
   690  			if !op.Done {
   691  				o.logger.Info("the shard DDL lock operation has not done", zap.Stringer("operation", op))
   692  				continue
   693  			}
   694  
   695  			// avoid new ddl added while previous ddl resolved and remove lock
   696  			// change lock granularity if needed
   697  			o.mu.Lock()
   698  			o.handleOperation(op)
   699  			o.mu.Unlock()
   700  		}
   701  	}
   702  }
   703  
   704  func (o *Optimist) handleOperation(op optimism.Operation) {
   705  	lock := o.lk.FindLock(op.ID)
   706  	if lock == nil {
   707  		o.logger.Warn("no lock for the shard DDL lock operation exist", zap.Stringer("operation", op))
   708  		return
   709  	}
   710  
   711  	err := lock.DeleteColumnsByOp(op)
   712  	if err != nil {
   713  		o.logger.Error("fail to update lock columns", zap.Error(err))
   714  	}
   715  	// in optimistic mode, we always try to mark a table as done after received the `done` status of the DDLs operation.
   716  	// NOTE: even all tables have done their previous DDLs operations, the lock may still not resolved,
   717  	// because these tables may have different schemas.
   718  	done := lock.TryMarkDone(op.Source, op.UpSchema, op.UpTable)
   719  	o.logger.Info("mark operation for a table as done", zap.Bool("done", done), zap.Stringer("operation", op))
   720  	if !lock.IsResolved() {
   721  		o.logger.Info("the lock is still not resolved", zap.Stringer("operation", op))
   722  		return
   723  	}
   724  	o.removeLockOptional(op, lock)
   725  }
   726  
   727  func (o *Optimist) removeLockOptional(op optimism.Operation, lock *optimism.Lock) {
   728  	// the lock has done, remove the lock.
   729  	o.logger.Info("the lock for the shard DDL lock operation has been resolved", zap.Stringer("operation", op))
   730  	deleted, err := o.removeLock(lock)
   731  	if err != nil {
   732  		o.logger.Error("fail to delete the shard DDL infos and lock operations", zap.String("lock", lock.ID), log.ShortError(err))
   733  		metrics.ReportDDLError(op.Task, metrics.OpErrRemoveLock)
   734  	}
   735  	if deleted {
   736  		o.logger.Info("the shard DDL infos and lock operations have been cleared", zap.Stringer("operation", op))
   737  	}
   738  }
   739  
   740  // handleLock handles a single shard DDL lock.
   741  func (o *Optimist) handleLock(info optimism.Info, tts []optimism.TargetTable, skipDone bool) error {
   742  	var (
   743  		cfStage = optimism.ConflictNone
   744  		cfMsg   = ""
   745  	)
   746  
   747  	lockID, newDDLs, cols, err := o.lk.TrySync(o.cli, info, tts)
   748  	switch {
   749  	case info.IgnoreConflict:
   750  		o.logger.Warn("error occur when trying to sync for shard DDL info, this often means shard DDL conflict detected",
   751  			zap.String("lock", lockID), zap.String("info", info.ShortString()), zap.Bool("is deleted", info.IsDeleted), log.ShortError(err))
   752  	case err != nil:
   753  		switch {
   754  		case terror.ErrShardDDLOptimismNeedSkipAndRedirect.Equal(err):
   755  			cfStage = optimism.ConflictSkipWaitRedirect
   756  			cfMsg = err.Error()
   757  			o.logger.Warn("Please make sure all sharding tables execute this DDL in order", log.ShortError(err))
   758  		case terror.ErrShardDDLOptimismTrySyncFail.Equal(err):
   759  			cfStage = optimism.ConflictDetected
   760  			cfMsg = err.Error()
   761  			o.logger.Warn("conflict occur when trying to sync for shard DDL info, this often means shard DDL conflict detected",
   762  				zap.String("lock", lockID), zap.String("info", info.ShortString()), zap.Bool("is deleted", info.IsDeleted), log.ShortError(err))
   763  		default:
   764  			cfStage = optimism.ConflictError // we treat any errors returned from `TrySync` as conflict detected now.
   765  			cfMsg = err.Error()
   766  			o.logger.Warn("error occur when trying to sync for shard DDL info, this often means shard DDL error happened",
   767  				zap.String("lock", lockID), zap.String("info", info.ShortString()), zap.Bool("is deleted", info.IsDeleted), log.ShortError(err))
   768  		}
   769  	default:
   770  		o.logger.Info("the shard DDL lock returned some DDLs",
   771  			zap.String("lock", lockID), zap.Strings("ddls", newDDLs), zap.Strings("cols", cols), zap.String("info", info.ShortString()), zap.Bool("is deleted", info.IsDeleted))
   772  	}
   773  
   774  	lock := o.lk.FindLock(lockID)
   775  	if lock == nil {
   776  		// should not happen
   777  		return terror.ErrMasterLockNotFound.Generate(lockID)
   778  	}
   779  
   780  	// check whether the lock has resolved.
   781  	if lock.IsResolved() {
   782  		// remove all operations for this shard DDL lock.
   783  		// this is to handle the case where dm-master exit before deleting operations for them.
   784  		_, err = o.removeLock(lock)
   785  		if err != nil {
   786  			return err
   787  		}
   788  		return nil
   789  	}
   790  
   791  	if info.IgnoreConflict {
   792  		return nil
   793  	}
   794  
   795  	op := optimism.NewOperation(lockID, lock.Task, info.Source, info.UpSchema, info.UpTable, newDDLs, cfStage, cfMsg, false, cols)
   796  	rev, succ, err := optimism.PutOperation(o.cli, skipDone, op, info.Revision)
   797  	if err != nil {
   798  		return err
   799  	}
   800  	o.logger.Info("put shard DDL lock operation", zap.String("lock", lockID),
   801  		zap.Stringer("operation", op), zap.Bool("already exist", !succ), zap.Int64("revision", rev))
   802  	return nil
   803  }
   804  
// removeLock removes the lock in memory and its information in etcd.
// It returns true only when the etcd deletion took effect, in which case the
// lock is also removed from the lock keeper and the pending metric is updated.
// When the deletion did not take effect, the in-memory lock is left untouched.
func (o *Optimist) removeLock(lock *optimism.Lock) (bool, error) {
	// test-only hook: wait up to `val` seconds, polling etcd once per second,
	// until a newer shard DDL info shows up for any table of this lock.
	failpoint.Inject("SleepWhenRemoveLock", func(val failpoint.Value) {
		t := val.(int)
		log.L().Info("wait new ddl info putted into etcd in optimistic",
			zap.String("failpoint", "SleepWhenRemoveLock"),
			zap.Int("max wait second", t))

		ticker := time.NewTicker(time.Second)
		defer ticker.Stop()
		timer := time.NewTimer(time.Duration(t) * time.Second)
		defer timer.Stop()
	OUTER:
		for {
			select {
			case <-timer.C:
				log.L().Info("failed to wait new DDL info", zap.Int("wait second", t))
				break OUTER
			case <-ticker.C:
				// manually check etcd
				// the txn only succeeds while every info key is still at or below
				// the version recorded in the lock.
				cmps := make([]clientv3.Cmp, 0)
				for source, schemaTables := range lock.Ready() {
					for schema, tables := range schemaTables {
						for table := range tables {
							info := optimism.NewInfo(lock.Task, source, schema, table, lock.DownSchema, lock.DownTable, nil, nil, nil)
							info.Version = lock.GetVersion(source, schema, table)
							key := common.ShardDDLOptimismInfoKeyAdapter.Encode(info.Task, info.Source, info.UpSchema, info.UpTable)
							cmps = append(cmps, clientv3.Compare(clientv3.Version(key), "<", info.Version+1))
						}
					}
				}
				resp, _, err := etcdutil.DoTxnWithRepeatable(o.cli, etcdutil.FullOpFunc(cmps, nil, nil))
				// a failed comparison means some key has a newer version than the lock knows about.
				if err == nil && !resp.Succeeded {
					log.L().Info("found new DDL info")
					break OUTER
				}
			}
		}
	})
	deleted, err := o.deleteInfosOps(lock)
	if err != nil {
		return deleted, err
	}
	if !deleted {
		return false, nil
	}
	// only drop the in-memory lock after its data has been deleted from etcd.
	o.lk.RemoveLock(lock.ID)
	metrics.ReportDDLPending(lock.Task, metrics.DDLPendingSynced, metrics.DDLPendingNone)
	return true, nil
}
   855  
   856  // deleteInfosOps DELETEs shard DDL lock info and operations.
   857  func (o *Optimist) deleteInfosOps(lock *optimism.Lock) (bool, error) {
   858  	infos := make([]optimism.Info, 0)
   859  	ops := make([]optimism.Operation, 0)
   860  	for source, schemaTables := range lock.Ready() {
   861  		for schema, tables := range schemaTables {
   862  			for table := range tables {
   863  				// NOTE: we rely on only `task`, `source`, `upSchema`, `upTable` and `Version` used for deletion.
   864  				info := optimism.NewInfo(lock.Task, source, schema, table, lock.DownSchema, lock.DownTable, nil, nil, nil)
   865  				info.Version = lock.GetVersion(source, schema, table)
   866  				infos = append(infos, info)
   867  				ops = append(ops, optimism.NewOperation(lock.ID, lock.Task, source, schema, table, nil, optimism.ConflictNone, "", false, nil))
   868  			}
   869  		}
   870  	}
   871  	// NOTE: we rely on only `task`, `downSchema`, and `downTable` used for deletion.
   872  	rev, deleted, err := optimism.DeleteInfosOperationsColumns(o.cli, infos, ops, lock.ID)
   873  	if err != nil {
   874  		return deleted, err
   875  	}
   876  	if deleted {
   877  		o.logger.Info("delete shard DDL infos and lock operations", zap.String("lock", lock.ID), zap.Int64("revision", rev))
   878  	} else {
   879  		o.logger.Info("fail to delete shard DDL infos and lock operations", zap.String("lock", lock.ID), zap.Int64("revision", rev))
   880  	}
   881  	return deleted, nil
   882  }