github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/dm/syncer/syncer.go (about) 1 // Copyright 2019 PingCAP, Inc. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // See the License for the specific language governing permissions and 12 // limitations under the License. 13 14 package syncer 15 16 import ( 17 "bytes" 18 "context" 19 "fmt" 20 "math" 21 "os" 22 "path" 23 "reflect" 24 "strconv" 25 "strings" 26 "sync" 27 "time" 28 29 "github.com/go-mysql-org/go-mysql/mysql" 30 "github.com/go-mysql-org/go-mysql/replication" 31 mysql2 "github.com/go-sql-driver/mysql" 32 "github.com/pingcap/errors" 33 "github.com/pingcap/failpoint" 34 tidbddl "github.com/pingcap/tidb/pkg/ddl" 35 "github.com/pingcap/tidb/pkg/parser" 36 "github.com/pingcap/tidb/pkg/parser/ast" 37 "github.com/pingcap/tidb/pkg/parser/model" 38 "github.com/pingcap/tidb/pkg/sessionctx" 39 "github.com/pingcap/tidb/pkg/util/dbutil" 40 "github.com/pingcap/tidb/pkg/util/filter" 41 regexprrouter "github.com/pingcap/tidb/pkg/util/regexpr-router" 42 router "github.com/pingcap/tidb/pkg/util/table-router" 43 "github.com/pingcap/tiflow/dm/config" 44 "github.com/pingcap/tiflow/dm/config/dbconfig" 45 "github.com/pingcap/tiflow/dm/pb" 46 "github.com/pingcap/tiflow/dm/pkg/binlog" 47 "github.com/pingcap/tiflow/dm/pkg/binlog/event" 48 "github.com/pingcap/tiflow/dm/pkg/binlog/reader" 49 "github.com/pingcap/tiflow/dm/pkg/conn" 50 tcontext "github.com/pingcap/tiflow/dm/pkg/context" 51 fr "github.com/pingcap/tiflow/dm/pkg/func-rollback" 52 "github.com/pingcap/tiflow/dm/pkg/gtid" 53 "github.com/pingcap/tiflow/dm/pkg/ha" 54 "github.com/pingcap/tiflow/dm/pkg/log" 55 parserpkg "github.com/pingcap/tiflow/dm/pkg/parser" 56 "github.com/pingcap/tiflow/dm/pkg/schema" 57 "github.com/pingcap/tiflow/dm/pkg/shardddl/optimism" 58 "github.com/pingcap/tiflow/dm/pkg/shardddl/pessimism" 59 "github.com/pingcap/tiflow/dm/pkg/storage" 60 "github.com/pingcap/tiflow/dm/pkg/streamer" 61 "github.com/pingcap/tiflow/dm/pkg/terror" 62 "github.com/pingcap/tiflow/dm/pkg/utils" 63 "github.com/pingcap/tiflow/dm/relay" 64 "github.com/pingcap/tiflow/dm/syncer/binlogstream" 65 "github.com/pingcap/tiflow/dm/syncer/dbconn" 66 "github.com/pingcap/tiflow/dm/syncer/metrics" 67 onlineddl "github.com/pingcap/tiflow/dm/syncer/online-ddl-tools" 68 sm "github.com/pingcap/tiflow/dm/syncer/safe-mode" 69 "github.com/pingcap/tiflow/dm/syncer/shardddl" 70 "github.com/pingcap/tiflow/dm/unit" 71 bf "github.com/pingcap/tiflow/pkg/binlog-filter" 72 "github.com/pingcap/tiflow/pkg/errorutil" 73 "github.com/pingcap/tiflow/pkg/sqlmodel" 74 clientv3 "go.etcd.io/etcd/client/v3" 75 "go.uber.org/atomic" 76 "go.uber.org/zap" 77 ) 78 79 var ( 80 waitTime = 10 * time.Millisecond 81 82 // MaxDDLConnectionTimeoutMinute also used by SubTask.ExecuteDDL. 
83 MaxDDLConnectionTimeoutMinute = 5 84 85 maxDMLConnectionTimeout = "5m" 86 maxDDLConnectionTimeout = fmt.Sprintf("%dm", MaxDDLConnectionTimeoutMinute) 87 88 maxDMLConnectionDuration, _ = time.ParseDuration(maxDMLConnectionTimeout) 89 90 defaultMaxPauseOrStopWaitTime = 10 * time.Second 91 92 adminQueueName = "admin queue" 93 defaultBucketCount = 8 94 ) 95 96 const ( 97 skipJobIdx = iota 98 ddlJobIdx 99 workerJobTSArrayInitSize // size = skip + ddl 100 ) 101 102 // waitXIDStatus represents the status for waiting XID event when pause/stop task. 103 type waitXIDStatus int64 104 105 const ( 106 noWait waitXIDStatus = iota 107 waiting 108 waitComplete 109 ) 110 111 // Syncer can sync your MySQL data to another MySQL database. 112 type Syncer struct { 113 sync.RWMutex 114 115 tctx *tcontext.Context // this ctx only used for logger. 116 117 // this ctx derives from a background ctx and was initialized in s.Run, it is used for some background tasks in s.Run 118 // when this ctx cancelled, syncer will shutdown all background running jobs (except the syncDML and syncDDL) and not wait transaction end. 119 runCtx *tcontext.Context 120 runCancel context.CancelFunc 121 // this ctx only used for syncDML and syncDDL and only cancelled when ungraceful stop. 122 syncCtx *tcontext.Context 123 syncCancel context.CancelFunc 124 // control all goroutines that started in S.Run 125 runWg sync.WaitGroup 126 127 cfg *config.SubTaskConfig 128 syncCfg replication.BinlogSyncerConfig 129 cliArgs *config.TaskCliArgs 130 metricsProxies *metrics.Proxies 131 132 sgk *ShardingGroupKeeper // keeper to keep all sharding (sub) group in this syncer 133 osgk *OptShardingGroupKeeper // optimistic ddl's keeper to keep all sharding (sub) group in this syncer 134 135 pessimist *shardddl.Pessimist // shard DDL pessimist 136 optimist *shardddl.Optimist // shard DDL optimist 137 cli *clientv3.Client 138 139 binlogType binlogstream.BinlogType 140 streamerController *binlogstream.StreamerController 141 142 jobWg sync.WaitGroup // counts ddl/flush/asyncFlush job in-flight in s.dmlJobCh and s.ddlJobCh 143 144 schemaTracker *schema.Tracker 145 146 fromDB *dbconn.UpStreamConn 147 fromConn *dbconn.DBConn 148 149 toDB *conn.BaseDB 150 toDBConns []*dbconn.DBConn 151 ddlDB *conn.BaseDB 152 ddlDBConn *dbconn.DBConn 153 downstreamTrackConn *dbconn.DBConn 154 155 dmlJobCh chan *job 156 ddlJobCh chan *job 157 jobsClosed atomic.Bool 158 jobsChanLock sync.Mutex 159 waitXIDJob atomic.Int64 160 isTransactionEnd bool 161 waitTransactionLock sync.Mutex 162 163 tableRouter *regexprrouter.RouteTable 164 binlogFilter *bf.BinlogEvent 165 baList *filter.Filter 166 exprFilterGroup *ExprFilterGroup 167 sessCtx sessionctx.Context 168 169 running atomic.Bool 170 closed atomic.Bool 171 172 start atomic.Time 173 lastTime atomic.Time 174 175 // safeMode is used to track if we need to generate dml with safe-mode 176 // For each binlog event, we will set the current value into eventContext because 177 // the status of this track may change over time. 
178 safeMode *sm.SafeMode 179 180 upstreamTZ *time.Location 181 timezone *time.Location 182 timezoneLastTime string 183 upstreamTZStr string 184 185 binlogSizeCount atomic.Int64 186 lastBinlogSizeCount atomic.Int64 187 188 lastCount atomic.Int64 189 count atomic.Int64 190 totalRps atomic.Int64 191 rps atomic.Int64 192 193 filteredInsert atomic.Int64 194 filteredUpdate atomic.Int64 195 filteredDelete atomic.Int64 196 197 checkpoint CheckPoint 198 checkpointFlushWorker *checkpointFlushWorker 199 onlineDDL onlineddl.OnlinePlugin 200 201 // record process error rather than log.Fatal 202 runFatalChan chan *pb.ProcessError 203 // record whether error occurred when execute SQLs 204 execError atomic.Error 205 206 readerHub *streamer.ReaderHub 207 recordedActiveRelayLog bool 208 209 currentLocationMu struct { 210 sync.RWMutex 211 currentLocation binlog.Location // use to calc remain binlog size 212 } 213 214 errLocation struct { 215 sync.RWMutex 216 startLocation *binlog.Location 217 endLocation *binlog.Location 218 isQueryEvent bool 219 } 220 221 cutOverLocation atomic.Pointer[binlog.Location] 222 223 handleJobFunc func(*job) (bool, error) 224 flushSeq int64 225 226 // `lower_case_table_names` setting of upstream db 227 SourceTableNamesFlavor conn.LowerCaseTableNamesFlavor 228 229 // time difference between upstream and DM nodes: time of DM - time of upstream. 230 // we use this to calculate replication lag more accurately when clock is not synced 231 // on either upstream or DM nodes. 232 tsOffset atomic.Int64 233 // this field measures the time difference in seconds between current time of DM and 234 // the minimal timestamp of currently processing binlog events. 235 // this lag will consider time difference between upstream and DM nodes 236 secondsBehindMaster atomic.Int64 237 // stores the last job TS(binlog event timestamp) of each worker, 238 // if there's no active job, the corresponding worker's TS is reset to 0. 239 // since DML worker runs jobs in batch, the TS is the TS of the first job in the batch. 240 // We account for skipped events too, if the distance between up and downstream is long, 241 // and there's no interested binlog event in between, we can have a decreasing lag. 242 // - 0 is for skip jobs 243 // - 1 is ddl worker 244 // - 2+ is for DML worker job idx=(queue id + 2) 245 workerJobTSArray []*atomic.Int64 246 lastCheckpointFlushedTime time.Time 247 248 firstMeetBinlogTS *int64 249 exitSafeModeTS *int64 // TS(in binlog header) need to exit safe mode. 250 251 // initial executed binlog location, set once for each instance of syncer. 252 initExecutedLoc *binlog.Location 253 254 relay relay.Process 255 charsetAndDefaultCollation map[string]string 256 idAndCollationMap map[int]string 257 258 ddlWorker *DDLWorker 259 } 260 261 // NewSyncer creates a new Syncer. 262 func NewSyncer(cfg *config.SubTaskConfig, etcdClient *clientv3.Client, relay relay.Process) *Syncer { 263 logFields := []zap.Field{ 264 zap.String("task", cfg.Name), 265 zap.String("unit", "binlog replication"), 266 } 267 var logger log.Logger 268 if cfg.FrameworkLogger != nil { 269 logger = log.Logger{Logger: cfg.FrameworkLogger.With(logFields...)} 270 } else { 271 logger = log.With(logFields...) 
272 } 273 274 syncer := &Syncer{ 275 pessimist: shardddl.NewPessimist(&logger, etcdClient, cfg.Name, cfg.SourceID), 276 optimist: shardddl.NewOptimist(&logger, etcdClient, cfg.Name, cfg.SourceID), 277 } 278 syncer.cfg = cfg 279 syncer.tctx = tcontext.Background().WithLogger(logger) 280 syncer.jobsClosed.Store(true) // not open yet 281 syncer.waitXIDJob.Store(int64(noWait)) 282 syncer.isTransactionEnd = true 283 syncer.closed.Store(false) 284 syncer.lastBinlogSizeCount.Store(0) 285 syncer.binlogSizeCount.Store(0) 286 syncer.lastCount.Store(0) 287 syncer.count.Store(0) 288 syncer.handleJobFunc = syncer.handleJob 289 syncer.cli = etcdClient 290 291 syncer.checkpoint = NewRemoteCheckPoint(syncer.tctx, cfg, syncer.metricsProxies, syncer.checkpointID()) 292 293 syncer.binlogType = binlogstream.RelayToBinlogType(relay) 294 syncer.readerHub = streamer.GetReaderHub() 295 296 if cfg.ShardMode == config.ShardPessimistic { 297 // only need to sync DDL in sharding mode 298 syncer.sgk = NewShardingGroupKeeper(syncer.tctx, cfg, syncer.metricsProxies) 299 } else if cfg.ShardMode == config.ShardOptimistic { 300 syncer.osgk = NewOptShardingGroupKeeper(syncer.tctx, cfg) 301 } 302 syncer.recordedActiveRelayLog = false 303 syncer.workerJobTSArray = make([]*atomic.Int64, cfg.WorkerCount+workerJobTSArrayInitSize) 304 for i := range syncer.workerJobTSArray { 305 syncer.workerJobTSArray[i] = atomic.NewInt64(0) 306 } 307 syncer.lastCheckpointFlushedTime = time.Time{} 308 syncer.relay = relay 309 syncer.safeMode = sm.NewSafeMode() 310 311 return syncer 312 } 313 314 func (s *Syncer) refreshCliArgs() { 315 if s.cli == nil { 316 // for dummy syncer in ut 317 return 318 } 319 cliArgs, err := ha.GetTaskCliArgs(s.cli, s.cfg.Name, s.cfg.SourceID) 320 if err != nil { 321 s.tctx.L().Error("failed to get task cli args", zap.Error(err)) 322 } 323 s.Lock() 324 s.cliArgs = cliArgs 325 s.Unlock() 326 } 327 328 func (s *Syncer) newJobChans() { 329 chanSize := calculateChanSize(s.cfg.QueueSize, s.cfg.WorkerCount, s.cfg.Compact) 330 s.dmlJobCh = make(chan *job, chanSize) 331 s.ddlJobCh = make(chan *job, s.cfg.QueueSize) 332 s.jobsClosed.Store(false) 333 } 334 335 func (s *Syncer) closeJobChans() { 336 s.jobsChanLock.Lock() 337 defer s.jobsChanLock.Unlock() 338 if s.jobsClosed.Load() { 339 return 340 } 341 close(s.dmlJobCh) 342 close(s.ddlJobCh) 343 s.jobsClosed.Store(true) 344 } 345 346 // Type implements Unit.Type. 347 func (s *Syncer) Type() pb.UnitType { 348 return pb.UnitType_Sync 349 } 350 351 // Init initializes syncer for a sync task, but not start Process. 352 // if fail, it should not call s.Close. 353 // some check may move to checker later. 
354 func (s *Syncer) Init(ctx context.Context) (err error) { 355 failpoint.Inject("IOTotalBytes", func(val failpoint.Value) { 356 c := atomic.NewUint64(0) 357 s.cfg.UUID = val.(string) 358 s.cfg.IOTotalBytes = c 359 360 go func() { 361 for { 362 s.tctx.L().Debug("IOTotalBytes", zap.Uint64("IOTotalBytes", s.cfg.IOTotalBytes.Load())) 363 time.Sleep(10 * time.Second) 364 } 365 }() 366 }) 367 rollbackHolder := fr.NewRollbackHolder("syncer") 368 defer func() { 369 if err != nil { 370 rollbackHolder.RollbackReverseOrder() 371 } 372 }() 373 374 tctx := s.tctx.WithContext(ctx) 375 s.upstreamTZ, s.upstreamTZStr, err = str2TimezoneOrFromDB(tctx, "", conn.UpstreamDBConfig(&s.cfg.From)) 376 if err != nil { 377 return 378 } 379 s.timezone, _, err = str2TimezoneOrFromDB(tctx, s.cfg.Timezone, conn.DownstreamDBConfig(&s.cfg.To)) 380 if err != nil { 381 return 382 } 383 384 s.baList, err = filter.New(s.cfg.CaseSensitive, s.cfg.BAList) 385 if err != nil { 386 return terror.ErrSyncerUnitGenBAList.Delegate(err) 387 } 388 389 s.syncCfg, err = subtaskCfg2BinlogSyncerCfg(s.cfg, s.timezone, s.baList) 390 if err != nil { 391 return err 392 } 393 394 err = s.createDBs(ctx) 395 if err != nil { 396 return err 397 } 398 rollbackHolder.Add(fr.FuncRollback{Name: "close-DBs", Fn: s.closeDBs}) 399 400 if s.cfg.CollationCompatible == config.StrictCollationCompatible { 401 s.charsetAndDefaultCollation, s.idAndCollationMap, err = dbconn.GetCharsetAndCollationInfo(tctx, s.fromConn) 402 if err != nil { 403 return err 404 } 405 } 406 407 s.streamerController = binlogstream.NewStreamerController( 408 s.syncCfg, 409 s.cfg.EnableGTID, 410 s.fromDB, 411 s.cfg.RelayDir, 412 s.timezone, 413 s.relay, 414 s.tctx.L(), 415 ) 416 417 s.binlogFilter, err = bf.NewBinlogEvent(s.cfg.CaseSensitive, s.cfg.FilterRules) 418 if err != nil { 419 return terror.ErrSyncerUnitGenBinlogEventFilter.Delegate(err) 420 } 421 422 vars := map[string]string{ 423 "time_zone": s.timezone.String(), 424 } 425 s.sessCtx = utils.NewSessionCtx(vars) 426 s.exprFilterGroup = NewExprFilterGroup(s.tctx, s.sessCtx, s.cfg.ExprFilter) 427 // create an empty Tracker and will be initialized in `Run` 428 s.schemaTracker = schema.NewTracker() 429 430 if s.cfg.OnlineDDL { 431 s.onlineDDL, err = onlineddl.NewRealOnlinePlugin(tctx, s.cfg, s.metricsProxies) 432 if err != nil { 433 return err 434 } 435 rollbackHolder.Add(fr.FuncRollback{Name: "close-onlineDDL", Fn: s.closeOnlineDDL}) 436 } 437 438 err = s.genRouter() 439 if err != nil { 440 return err 441 } 442 443 var schemaMap map[string]string 444 var tableMap map[string]map[string]string 445 if s.SourceTableNamesFlavor == conn.LCTableNamesSensitive { 446 // TODO: we should avoid call this function multi times 447 allTables, err1 := conn.FetchAllDoTables(ctx, s.fromDB.BaseDB, s.baList) 448 if err1 != nil { 449 return err1 450 } 451 schemaMap, tableMap = buildLowerCaseTableNamesMap(s.tctx.L(), allTables) 452 } 453 454 switch s.cfg.ShardMode { 455 case config.ShardPessimistic: 456 err = s.sgk.Init() 457 if err != nil { 458 return err 459 } 460 err = s.initShardingGroups(ctx, true) 461 if err != nil { 462 return err 463 } 464 rollbackHolder.Add(fr.FuncRollback{Name: "close-sharding-group-keeper", Fn: s.sgk.Close}) 465 case config.ShardOptimistic: 466 if err = s.initOptimisticShardDDL(ctx); err != nil { 467 return err 468 } 469 } 470 471 err = s.checkpoint.Init(tctx) 472 if err != nil { 473 return err 474 } 475 476 rollbackHolder.Add(fr.FuncRollback{Name: "close-checkpoint", Fn: s.checkpoint.Close}) 477 478 err = 
s.checkpoint.Load(tctx) 479 if err != nil { 480 return err 481 } 482 if s.SourceTableNamesFlavor == conn.LCTableNamesSensitive { 483 if err = s.checkpoint.CheckAndUpdate(ctx, schemaMap, tableMap); err != nil { 484 return err 485 } 486 487 if s.onlineDDL != nil { 488 if err = s.onlineDDL.CheckAndUpdate(s.tctx, schemaMap, tableMap); err != nil { 489 return err 490 } 491 } 492 } 493 494 // when Init syncer, set active relay log info 495 if s.cfg.Meta == nil || s.cfg.Meta.BinLogName != binlog.FakeBinlogName { 496 err = s.setInitActiveRelayLog(ctx) 497 if err != nil { 498 return err 499 } 500 rollbackHolder.Add(fr.FuncRollback{Name: "remove-active-realylog", Fn: s.removeActiveRelayLog}) 501 } 502 s.reset() 503 504 metricProxies := metrics.DefaultMetricsProxies 505 if s.cfg.MetricsFactory != nil { 506 metricProxies = &metrics.Proxies{} 507 metricProxies.Init(s.cfg.MetricsFactory) 508 } 509 s.metricsProxies = metricProxies.CacheForOneTask(s.cfg.Name, s.cfg.WorkerName, s.cfg.SourceID) 510 511 s.ddlWorker = NewDDLWorker(&s.tctx.Logger, s) 512 return nil 513 } 514 515 // buildLowerCaseTableNamesMap build a lower case schema map and lower case table map for all tables 516 // Input: map of schema --> list of tables 517 // Output: schema names map: lower_case_schema_name --> schema_name 518 // 519 // tables names map: lower_case_schema_name --> lower_case_table_name --> table_name 520 // 521 // Note: the result will skip the schemas and tables that their lower_case_name are the same. 522 func buildLowerCaseTableNamesMap(logger log.Logger, tables map[string][]string) (map[string]string, map[string]map[string]string) { 523 schemaMap := make(map[string]string) 524 tablesMap := make(map[string]map[string]string) 525 lowerCaseSchemaSet := make(map[string]string) 526 for schema, tableNames := range tables { 527 lcSchema := strings.ToLower(schema) 528 // track if there are multiple schema names with the same lower case name. 529 // just skip this kind of schemas. 530 if rawSchema, ok := lowerCaseSchemaSet[lcSchema]; ok { 531 delete(schemaMap, lcSchema) 532 delete(tablesMap, lcSchema) 533 logger.Warn("skip check schema with same lower case value", 534 zap.Strings("schemas", []string{schema, rawSchema})) 535 continue 536 } 537 lowerCaseSchemaSet[lcSchema] = schema 538 539 if lcSchema != schema { 540 schemaMap[lcSchema] = schema 541 } 542 tblsMap := make(map[string]string) 543 lowerCaseTableSet := make(map[string]string) 544 for _, tb := range tableNames { 545 lcTbl := strings.ToLower(tb) 546 if rawTbl, ok := lowerCaseTableSet[lcTbl]; ok { 547 delete(tblsMap, lcTbl) 548 logger.Warn("skip check tables with same lower case value", zap.String("schema", schema), 549 zap.Strings("table", []string{tb, rawTbl})) 550 continue 551 } 552 if lcTbl != tb { 553 tblsMap[lcTbl] = tb 554 } 555 } 556 if len(tblsMap) > 0 { 557 tablesMap[lcSchema] = tblsMap 558 } 559 } 560 return schemaMap, tablesMap 561 } 562 563 // initShardingGroups initializes sharding groups according to source MySQL, filter rules and router rules 564 // NOTE: now we don't support modify router rules after task has started. 
565 func (s *Syncer) initShardingGroups(ctx context.Context, needCheck bool) error { 566 // fetch tables from source and filter them 567 sourceTables, err := s.fromDB.FetchAllDoTables(ctx, s.baList) 568 if err != nil { 569 return err 570 } 571 572 // convert according to router rules 573 // target-ID -> source-IDs 574 mapper := make(map[string][]string, len(sourceTables)) 575 for schema, tables := range sourceTables { 576 for _, table := range tables { 577 sourceTable := &filter.Table{Schema: schema, Name: table} 578 targetTable := s.route(sourceTable) 579 targetID := utils.GenTableID(targetTable) 580 sourceID := utils.GenTableID(sourceTable) 581 _, ok := mapper[targetID] 582 if !ok { 583 mapper[targetID] = make([]string, 0, len(tables)) 584 } 585 mapper[targetID] = append(mapper[targetID], sourceID) 586 } 587 } 588 589 loadMeta, err2 := s.sgk.LoadShardMeta(s.cfg.Flavor, s.cfg.EnableGTID) 590 if err2 != nil { 591 return err2 592 } 593 if needCheck && s.SourceTableNamesFlavor == conn.LCTableNamesSensitive { 594 // try fix persistent data before init 595 schemaMap, tableMap := buildLowerCaseTableNamesMap(s.tctx.L(), sourceTables) 596 if err2 = s.sgk.CheckAndFix(loadMeta, schemaMap, tableMap); err2 != nil { 597 return err2 598 } 599 } 600 601 // add sharding group 602 for targetID, sourceIDs := range mapper { 603 targetTable := utils.UnpackTableID(targetID) 604 _, _, _, _, err := s.sgk.AddGroup(targetTable, sourceIDs, loadMeta[targetID], false) 605 if err != nil { 606 return err 607 } 608 } 609 610 shardGroup := s.sgk.Groups() 611 s.tctx.L().Debug("initial sharding groups", zap.Int("shard group length", len(shardGroup)), zap.Reflect("shard group", shardGroup)) 612 613 return nil 614 } 615 616 // IsFreshTask implements Unit.IsFreshTask. 617 func (s *Syncer) IsFreshTask(ctx context.Context) (bool, error) { 618 globalPoint := s.checkpoint.GlobalPoint() 619 tablePoint := s.checkpoint.TablePoint() 620 // doesn't have neither GTID nor binlog pos 621 return binlog.IsFreshPosition(globalPoint, s.cfg.Flavor, s.cfg.EnableGTID) && len(tablePoint) == 0, nil 622 } 623 624 func (s *Syncer) reset() { 625 if s.streamerController != nil { 626 s.streamerController.Close() 627 } 628 s.secondsBehindMaster.Store(0) 629 for _, jobTS := range s.workerJobTSArray { 630 jobTS.Store(0) 631 } 632 // create new job chans 633 s.newJobChans() 634 s.checkpoint.DiscardPendingSnapshots() 635 s.checkpointFlushWorker = &checkpointFlushWorker{ 636 input: make(chan *checkpointFlushTask, 16), 637 cp: s.checkpoint, 638 execError: &s.execError, 639 afterFlushFn: s.afterFlushCheckpoint, 640 updateJobMetricsFn: s.updateJobMetrics, 641 } 642 643 s.execError.Store(nil) 644 s.setErrLocation(nil, nil, false) 645 s.waitXIDJob.Store(int64(noWait)) 646 s.isTransactionEnd = true 647 s.flushSeq = 0 648 s.firstMeetBinlogTS = nil 649 s.exitSafeModeTS = nil 650 switch s.cfg.ShardMode { 651 case config.ShardPessimistic: 652 // every time start to re-sync from resume, we reset status to make it like a fresh syncing 653 s.sgk.ResetGroups() 654 s.pessimist.Reset() 655 case config.ShardOptimistic: 656 s.osgk.Reset() 657 s.optimist.Reset() 658 } 659 } 660 661 func (s *Syncer) resetDBs(tctx *tcontext.Context) error { 662 var err error 663 664 for i := 0; i < len(s.toDBConns); i++ { 665 err = s.toDBConns[i].ResetConn(tctx) 666 if err != nil { 667 return terror.WithScope(err, terror.ScopeDownstream) 668 } 669 } 670 671 if s.onlineDDL != nil { 672 err = s.onlineDDL.ResetConn(tctx) 673 if err != nil { 674 return terror.WithScope(err, terror.ScopeDownstream) 
675 } 676 } 677 678 if s.sgk != nil { 679 err = s.sgk.dbConn.ResetConn(tctx) 680 if err != nil { 681 return terror.WithScope(err, terror.ScopeDownstream) 682 } 683 } 684 685 err = s.ddlDBConn.ResetConn(tctx) 686 if err != nil { 687 return terror.WithScope(err, terror.ScopeDownstream) 688 } 689 690 err = s.downstreamTrackConn.ResetConn(tctx) 691 if err != nil { 692 return terror.WithScope(err, terror.ScopeDownstream) 693 } 694 695 err = s.checkpoint.ResetConn(tctx) 696 if err != nil { 697 return terror.WithScope(err, terror.ScopeDownstream) 698 } 699 700 return nil 701 } 702 703 func (s *Syncer) handleExitErrMetric(err *pb.ProcessError) { 704 if unit.IsResumableError(err) { 705 s.metricsProxies.Metrics.ExitWithResumableErrorCounter.Inc() 706 } else { 707 s.metricsProxies.Metrics.ExitWithNonResumableErrorCounter.Inc() 708 } 709 } 710 711 // Process implements the dm.Unit interface. 712 func (s *Syncer) Process(ctx context.Context, pr chan pb.ProcessResult) { 713 s.metricsProxies.Metrics.ExitWithResumableErrorCounter.Add(0) 714 s.metricsProxies.Metrics.ExitWithNonResumableErrorCounter.Add(0) 715 716 newCtx, cancel := context.WithCancel(ctx) 717 defer cancel() 718 719 // create new done chan 720 // use lock of Syncer to avoid Close while Process 721 s.Lock() 722 if s.isClosed() { 723 s.Unlock() 724 return 725 } 726 s.Unlock() 727 728 runFatalChan := make(chan *pb.ProcessError, s.cfg.WorkerCount+1) 729 s.runFatalChan = runFatalChan 730 var ( 731 errs = make([]*pb.ProcessError, 0, 2) 732 errsMu sync.Mutex 733 ) 734 735 var wg sync.WaitGroup 736 wg.Add(1) 737 go func() { 738 defer wg.Done() 739 for { 740 err, ok := <-runFatalChan 741 if !ok { 742 return 743 } 744 cancel() // cancel s.Run 745 errsMu.Lock() 746 errs = append(errs, err) 747 errsMu.Unlock() 748 } 749 }() 750 751 wg.Add(1) 752 go func() { 753 defer wg.Done() 754 <-newCtx.Done() // ctx or newCtx 755 }() 756 757 err := s.Run(newCtx) 758 if err != nil { 759 // returned error rather than sent to runFatalChan 760 // cancel goroutines created in s.Run 761 cancel() 762 } 763 close(runFatalChan) // Run returned, all potential fatal sent to s.runFatalChan 764 wg.Wait() // wait for receive all fatal from s.runFatalChan 765 766 if err != nil { 767 if utils.IsContextCanceledError(err) { 768 s.tctx.L().Info("filter out error caused by user cancel", log.ShortError(err)) 769 } else { 770 s.tctx.L().Debug("unit syncer quits with error", zap.Error(err)) 771 errsMu.Lock() 772 errs = append(errs, unit.NewProcessError(err)) 773 errsMu.Unlock() 774 } 775 } 776 777 isCanceled := false 778 select { 779 case <-ctx.Done(): 780 isCanceled = true 781 default: 782 } 783 784 for _, processError := range errs { 785 s.handleExitErrMetric(processError) 786 } 787 pr <- pb.ProcessResult{ 788 IsCanceled: isCanceled, 789 Errors: errs, 790 } 791 } 792 793 // getTableInfo returns a table info for sourceTable, it should not be modified by caller. 
794 func (s *Syncer) getTableInfo(tctx *tcontext.Context, sourceTable, targetTable *filter.Table) (*model.TableInfo, error) { 795 if s.schemaTracker == nil { 796 return nil, terror.ErrSchemaTrackerIsClosed.New("schema tracker not init") 797 } 798 ti, err := s.schemaTracker.GetTableInfo(sourceTable) 799 if err == nil { 800 return ti, nil 801 } 802 if !schema.IsTableNotExists(err) { 803 return nil, terror.ErrSchemaTrackerCannotGetTable.Delegate(err, sourceTable) 804 } 805 806 if err = s.schemaTracker.CreateSchemaIfNotExists(sourceTable.Schema); err != nil { 807 return nil, terror.ErrSchemaTrackerCannotCreateSchema.Delegate(err, sourceTable.Schema) 808 } 809 810 // if the table does not exist (IsTableNotExists(err)), continue to fetch the table from downstream and create it. 811 err = s.trackTableInfoFromDownstream(tctx, sourceTable, targetTable) 812 if err != nil { 813 return nil, err 814 } 815 816 ti, err = s.schemaTracker.GetTableInfo(sourceTable) 817 if err != nil { 818 return nil, terror.ErrSchemaTrackerCannotGetTable.Delegate(err, sourceTable) 819 } 820 return ti, nil 821 } 822 823 // getDBInfoFromDownstream tries to track the db info from the downstream. It will not overwrite existing table. 824 func (s *Syncer) getDBInfoFromDownstream(tctx *tcontext.Context, sourceTable, targetTable *filter.Table) (*model.DBInfo, error) { 825 // TODO: Switch to use the HTTP interface to retrieve the TableInfo directly if HTTP port is available 826 // use parser for downstream. 827 parser2, err := dbconn.GetParserForConn(tctx, s.ddlDBConn) 828 if err != nil { 829 return nil, terror.ErrSchemaTrackerCannotParseDownstreamTable.Delegate(err, targetTable, sourceTable) 830 } 831 832 createSQL, err := dbconn.GetSchemaCreateSQL(tctx, s.ddlDBConn, targetTable.Schema) 833 if err != nil { 834 return nil, terror.ErrSchemaTrackerCannotFetchDownstreamTable.Delegate(err, targetTable, sourceTable) 835 } 836 837 createNode, err := parser2.ParseOneStmt(createSQL, "", "") 838 if err != nil { 839 return nil, terror.ErrSchemaTrackerCannotParseDownstreamTable.Delegate(err, targetTable, sourceTable) 840 } 841 stmt := createNode.(*ast.CreateDatabaseStmt) 842 843 // we only consider explicit charset/collate, if not found, fallback to default charset/collate. 844 charsetOpt := ast.CharsetOpt{} 845 for _, val := range stmt.Options { 846 switch val.Tp { 847 case ast.DatabaseOptionCharset: 848 charsetOpt.Chs = val.Value 849 case ast.DatabaseOptionCollate: 850 charsetOpt.Col = val.Value 851 } 852 } 853 854 chs, coll, err := tidbddl.ResolveCharsetCollation(nil, charsetOpt) 855 if err != nil { 856 return nil, errors.Trace(err) 857 } 858 859 return &model.DBInfo{Name: stmt.Name, Charset: chs, Collate: coll}, nil 860 } 861 862 // trackTableInfoFromDownstream tries to track the table info from the downstream. It will not overwrite existing table. 863 func (s *Syncer) trackTableInfoFromDownstream(tctx *tcontext.Context, sourceTable, targetTable *filter.Table) error { 864 // TODO: Switch to use the HTTP interface to retrieve the TableInfo directly if HTTP port is available 865 // use parser for downstream. 
866 parser2, err := dbconn.GetParserForConn(tctx, s.ddlDBConn) 867 if err != nil { 868 return terror.ErrSchemaTrackerCannotParseDownstreamTable.Delegate(err, targetTable, sourceTable) 869 } 870 871 createSQL, err := dbconn.GetTableCreateSQL(tctx, s.ddlDBConn, targetTable.String()) 872 if err != nil { 873 return terror.ErrSchemaTrackerCannotFetchDownstreamTable.Delegate(err, targetTable, sourceTable) 874 } 875 876 // rename the table back to original. 877 var createNode ast.StmtNode 878 createNode, err = parser2.ParseOneStmt(createSQL, "", "") 879 if err != nil { 880 return terror.ErrSchemaTrackerCannotParseDownstreamTable.Delegate(err, targetTable, sourceTable) 881 } 882 createStmt := createNode.(*ast.CreateTableStmt) 883 createStmt.IfNotExists = true 884 createStmt.Table.Schema = model.NewCIStr(sourceTable.Schema) 885 createStmt.Table.Name = model.NewCIStr(sourceTable.Name) 886 887 // schema tracker sets non-clustered index, so can't handle auto_random. 888 for _, col := range createStmt.Cols { 889 for i, opt := range col.Options { 890 if opt.Tp == ast.ColumnOptionAutoRandom { 891 // col.Options is unordered 892 col.Options[i] = col.Options[len(col.Options)-1] 893 col.Options = col.Options[:len(col.Options)-1] 894 break 895 } 896 } 897 } 898 899 tctx.L().Debug("reverse-synchronized table schema", 900 zap.Stringer("sourceTable", sourceTable), 901 zap.Stringer("targetTable", targetTable), 902 zap.String("sql", createSQL), 903 ) 904 if err = s.schemaTracker.Exec(tctx.Ctx, sourceTable.Schema, createStmt); err != nil { 905 return terror.ErrSchemaTrackerCannotCreateTable.Delegate(err, sourceTable) 906 } 907 908 return nil 909 } 910 911 var dmlMetric = map[sqlmodel.RowChangeType]string{ 912 sqlmodel.RowChangeInsert: "insert", 913 sqlmodel.RowChangeUpdate: "update", 914 sqlmodel.RowChangeDelete: "delete", 915 } 916 917 func (s *Syncer) updateJobMetrics(isFinished bool, queueBucket string, j *job) { 918 tp := j.tp 919 targetTable := j.targetTable 920 count := 1 921 if tp == ddl { 922 count = len(j.ddls) 923 } 924 925 m := s.metricsProxies.AddedJobsTotal 926 if isFinished { 927 s.count.Add(int64(count)) 928 m = s.metricsProxies.FinishedJobsTotal 929 } 930 switch tp { 931 case dml: 932 m.WithLabelValues(dmlMetric[j.dml.Type()], s.cfg.Name, queueBucket, s.cfg.SourceID, s.cfg.WorkerName, targetTable.Schema, targetTable.Name).Add(float64(count)) 933 case ddl, flush, asyncFlush, conflict, compact: 934 m.WithLabelValues(tp.String(), s.cfg.Name, queueBucket, s.cfg.SourceID, s.cfg.WorkerName, targetTable.Schema, targetTable.Name).Add(float64(count)) 935 case skip, xid: 936 // ignore skip/xid jobs 937 default: 938 s.tctx.L().Warn("unknown job operation type", zap.Stringer("type", j.tp)) 939 } 940 } 941 942 func (s *Syncer) calcReplicationLag(headerTS int64) int64 { 943 return time.Now().Unix() - s.tsOffset.Load() - headerTS 944 } 945 946 // updateReplicationJobTS store job TS, it is called after every batch dml job / one skip job / one ddl job is added and committed. 
947 func (s *Syncer) updateReplicationJobTS(job *job, jobIdx int) { 948 // when job is nil mean no job in this bucket, need to reset this bucket job ts to 0 949 if job == nil { 950 s.workerJobTSArray[jobIdx].Store(0) 951 } else { 952 s.workerJobTSArray[jobIdx].Store(int64(job.eventHeader.Timestamp)) 953 } 954 } 955 956 func (s *Syncer) updateReplicationLagMetric() { 957 var lag int64 958 var minTS int64 959 960 for idx := range s.workerJobTSArray { 961 if ts := s.workerJobTSArray[idx].Load(); ts != int64(0) { 962 if minTS == int64(0) || ts < minTS { 963 minTS = ts 964 } 965 } 966 } 967 if minTS != int64(0) { 968 lag = s.calcReplicationLag(minTS) 969 } 970 971 s.metricsProxies.Metrics.ReplicationLagHistogram.Observe(float64(lag)) 972 s.metricsProxies.Metrics.ReplicationLagGauge.Set(float64(lag)) 973 s.secondsBehindMaster.Store(lag) 974 975 failpoint.Inject("ShowLagInLog", func(v failpoint.Value) { 976 minLag := v.(int) 977 if int(lag) >= minLag { 978 s.tctx.L().Info("ShowLagInLog", zap.Int64("lag", lag)) 979 } 980 }) 981 982 // reset skip job TS in case of skip job TS is never updated 983 if minTS == s.workerJobTSArray[skipJobIdx].Load() { 984 s.workerJobTSArray[skipJobIdx].Store(0) 985 } 986 } 987 988 func (s *Syncer) saveTablePoint(table *filter.Table, location binlog.Location) { 989 ti, err := s.schemaTracker.GetTableInfo(table) 990 if err != nil && table.Name != "" { 991 // TODO: if we RENAME tb1 TO tb2, the tracker will remove TableInfo of tb1 but we still save the table 992 // checkpoint for tb1. We can delete the table checkpoint in future. 993 s.tctx.L().Warn("table info missing from schema tracker", 994 zap.Stringer("table", table), 995 zap.Stringer("location", location), 996 zap.Error(err)) 997 } 998 s.checkpoint.SaveTablePoint(table, location, ti) 999 } 1000 1001 // only used in tests. 1002 var ( 1003 lastLocationForTest binlog.Location 1004 lastLocationNumForTest int 1005 waitJobsDoneForTest bool 1006 failExecuteSQLForTest bool 1007 failOnceForTest atomic.Bool 1008 waitBeforeRunExitDurationForTest time.Duration 1009 ) 1010 1011 // TODO: move to syncer/job.go 1012 // addJob adds one job to DML queue or DDL queue according to its type. 1013 // Caller should prepare all needed jobs before calling this function, addJob should not generate any new jobs. 1014 // There should not be a second way to send jobs to DML queue or DDL queue. 1015 func (s *Syncer) addJob(job *job) { 1016 failpoint.Inject("countJobFromOneEvent", func() { 1017 if job.tp == dml { 1018 if job.currentLocation.Position.Compare(lastLocationForTest.Position) == 0 { 1019 lastLocationNumForTest++ 1020 } else { 1021 lastLocationForTest = job.currentLocation 1022 lastLocationNumForTest = 1 1023 } 1024 // trigger a flush after see one job 1025 if lastLocationNumForTest == 1 { 1026 waitJobsDoneForTest = true 1027 s.tctx.L().Info("meet the first job of an event", zap.Any("binlog position", lastLocationForTest)) 1028 } 1029 // mock a execution error after see two jobs. 
1030 if lastLocationNumForTest == 2 { 1031 failExecuteSQLForTest = true 1032 s.tctx.L().Info("meet the second job of an event", zap.Any("binlog position", lastLocationForTest)) 1033 } 1034 } 1035 }) 1036 failpoint.Inject("countJobFromOneGTID", func() { 1037 if job.tp == dml { 1038 if binlog.CompareLocation(job.currentLocation, lastLocationForTest, true) == 0 { 1039 lastLocationNumForTest++ 1040 } else { 1041 lastLocationForTest = job.currentLocation 1042 lastLocationNumForTest = 1 1043 } 1044 // trigger a flush after see one job 1045 if lastLocationNumForTest == 1 { 1046 waitJobsDoneForTest = true 1047 s.tctx.L().Info("meet the first job of a GTID", zap.Any("binlog position", lastLocationForTest)) 1048 } 1049 // mock a execution error after see two jobs. 1050 if lastLocationNumForTest == 2 { 1051 failExecuteSQLForTest = true 1052 s.tctx.L().Info("meet the second job of a GTID", zap.Any("binlog position", lastLocationForTest)) 1053 } 1054 } 1055 }) 1056 1057 // avoid job.type data race with compactor.run() 1058 // simply copy the opType for performance, though copy a new job in compactor is better 1059 tp := job.tp 1060 switch tp { 1061 case flush: 1062 s.jobWg.Add(1) 1063 s.dmlJobCh <- job 1064 case asyncFlush: 1065 s.jobWg.Add(1) 1066 s.dmlJobCh <- job 1067 case ddl: 1068 s.updateJobMetrics(false, adminQueueName, job) 1069 s.jobWg.Add(1) 1070 startTime := time.Now() 1071 s.ddlJobCh <- job 1072 s.metricsProxies.AddJobDurationHistogram.WithLabelValues("ddl", s.cfg.Name, adminQueueName, s.cfg.SourceID).Observe(time.Since(startTime).Seconds()) 1073 case dml: 1074 failpoint.Inject("SkipDML", func(val failpoint.Value) { 1075 // first col should be an int and primary key, every row with pk <= val will be skipped 1076 skippedIDUpperBound := val.(int) 1077 firstColVal, _ := strconv.Atoi(fmt.Sprintf("%v", job.dml.RowValues()[0])) 1078 if firstColVal <= skippedIDUpperBound { 1079 failpoint.Goto("skip_dml") 1080 } 1081 }) 1082 s.dmlJobCh <- job 1083 failpoint.Label("skip_dml") 1084 failpoint.Inject("checkCheckpointInMiddleOfTransaction", func() { 1085 s.tctx.L().Info("receive dml job", zap.Any("dml job", job)) 1086 time.Sleep(500 * time.Millisecond) 1087 }) 1088 case gc: 1089 s.dmlJobCh <- job 1090 default: 1091 s.tctx.L().DPanic("unhandled job type", zap.Stringer("job", job)) 1092 } 1093 } 1094 1095 // flushIfOutdated checks whether syncer should flush now because last flushing is outdated. 1096 func (s *Syncer) flushIfOutdated() error { 1097 if !s.checkpoint.LastFlushOutdated() { 1098 return nil 1099 } 1100 1101 if s.cfg.Experimental.AsyncCheckpointFlush { 1102 jobSeq := s.getFlushSeq() 1103 s.tctx.L().Info("Start to async flush current checkpoint to downstream based on flush interval", zap.Int64("job sequence", jobSeq)) 1104 j := newAsyncFlushJob(s.cfg.WorkerCount, jobSeq) 1105 s.addJob(j) 1106 s.flushCheckPointsAsync(j) 1107 return nil 1108 } 1109 return s.flushJobs() 1110 } 1111 1112 // TODO: move to syncer/job.go 1113 // handleJob will do many actions based on job type. 1114 func (s *Syncer) handleJob(job *job) (added2Queue bool, err error) { 1115 skipCheckFlush := false 1116 defer func() { 1117 if !skipCheckFlush && err == nil { 1118 if cutoverLocation := s.cutOverLocation.Load(); cutoverLocation != nil && binlog.CompareLocation(*cutoverLocation, job.currentLocation, s.cfg.EnableGTID) <= 0 { 1119 err = s.flushJobs() 1120 s.cutOverLocation.Store(nil) 1121 } else { 1122 err = s.flushIfOutdated() 1123 } 1124 } 1125 }() 1126 1127 // 1. 
handle jobs that not needed to be sent to queue 1128 1129 s.waitTransactionLock.Lock() 1130 defer s.waitTransactionLock.Unlock() 1131 1132 failpoint.Inject("checkCheckpointInMiddleOfTransaction", func() { 1133 if waitXIDStatus(s.waitXIDJob.Load()) == waiting { 1134 s.tctx.L().Info("not receive xid job yet", zap.Any("next job", job)) 1135 } 1136 }) 1137 1138 if waitXIDStatus(s.waitXIDJob.Load()) == waitComplete && job.tp != flush { 1139 s.tctx.L().Info("All jobs is completed before syncer close, the coming job will be reject", zap.Any("job", job)) 1140 return 1141 } 1142 1143 switch job.tp { 1144 case xid: 1145 failpoint.Inject("SkipSaveGlobalPoint", func() { 1146 s.tctx.L().Info("skip save global point", zap.String("failpoint", "SkipSaveGlobalPoint")) 1147 panic("SkipSaveGlobalPoint") 1148 }) 1149 s.waitXIDJob.CAS(int64(waiting), int64(waitComplete)) 1150 s.saveGlobalPoint(job.location) 1151 s.isTransactionEnd = true 1152 return 1153 case skip: 1154 if job.eventHeader.EventType == replication.QUERY_EVENT { 1155 // skipped ddl includes: 1156 // - ddls that dm don't handle, such as analyze table(can be parsed), create function(cannot be parsed) 1157 // - ddls related to db/table which is filtered 1158 // for those ddls we record its location, so checkpoint can match master position if skipped ddl 1159 // is the last binlog in source db 1160 s.saveGlobalPoint(job.location) 1161 } 1162 s.updateReplicationJobTS(job, skipJobIdx) 1163 return 1164 } 1165 1166 // 2. send the job to queue 1167 1168 s.addJob(job) 1169 added2Queue = true 1170 1171 // 3. after job is sent to queue 1172 1173 switch job.tp { 1174 case dml: 1175 // DMl events will generate many jobs and only caller knows all jobs has been sent, so many logics are done by 1176 // caller 1177 s.isTransactionEnd = false 1178 skipCheckFlush = true 1179 return 1180 case ddl: 1181 s.jobWg.Wait() 1182 1183 // skip rest logic when downstream error 1184 if s.execError.Load() != nil { 1185 // nolint:nilerr 1186 return 1187 } 1188 s.updateReplicationJobTS(nil, ddlJobIdx) // clear ddl job ts because this ddl is already done. 1189 failpoint.Inject("ExitAfterDDLBeforeFlush", func() { 1190 s.tctx.L().Warn("exit triggered", zap.String("failpoint", "ExitAfterDDLBeforeFlush")) 1191 utils.OsExit(1) 1192 }) 1193 // interrupted after executed DDL and before save checkpoint. 1194 failpoint.Inject("FlushCheckpointStage", func(val failpoint.Value) { 1195 err = handleFlushCheckpointStage(3, val.(int), "before save checkpoint") 1196 if err != nil { 1197 failpoint.Return() 1198 } 1199 }) 1200 // save global checkpoint for DDL 1201 s.saveGlobalPoint(job.location) 1202 for sourceSchema, tbs := range job.sourceTbls { 1203 if len(sourceSchema) == 0 { 1204 continue 1205 } 1206 for _, sourceTable := range tbs { 1207 s.saveTablePoint(sourceTable, job.location) 1208 } 1209 } 1210 // reset sharding group after checkpoint saved 1211 s.resetShardingGroup(job.targetTable) 1212 1213 // interrupted after save checkpoint and before flush checkpoint. 
1214 failpoint.Inject("FlushCheckpointStage", func(val failpoint.Value) { 1215 err = handleFlushCheckpointStage(4, val.(int), "before flush checkpoint") 1216 if err != nil { 1217 failpoint.Return() 1218 } 1219 }) 1220 skipCheckFlush = true 1221 err = s.flushCheckPoints() 1222 return 1223 case flush: 1224 s.jobWg.Wait() 1225 skipCheckFlush = true 1226 err = s.flushCheckPoints() 1227 return 1228 case asyncFlush: 1229 skipCheckFlush = true 1230 } 1231 // nolint:nakedret 1232 return 1233 } 1234 1235 func (s *Syncer) saveGlobalPoint(globalLocation binlog.Location) { 1236 if s.cfg.ShardMode == config.ShardPessimistic { 1237 globalLocation = s.sgk.AdjustGlobalLocation(globalLocation) 1238 } else if s.cfg.ShardMode == config.ShardOptimistic { 1239 globalLocation = s.osgk.AdjustGlobalLocation(globalLocation) 1240 } 1241 s.checkpoint.SaveGlobalPoint(globalLocation) 1242 } 1243 1244 func (s *Syncer) resetShardingGroup(table *filter.Table) { 1245 if s.cfg.ShardMode == config.ShardPessimistic { 1246 // for DDL sharding group, reset group after checkpoint saved 1247 group := s.sgk.Group(table) 1248 if group != nil { 1249 group.Reset() 1250 } 1251 } 1252 } 1253 1254 // flushCheckPoints synchronously flushes previous saved checkpoint in memory to persistent storage, like TiDB 1255 // we flush checkpoints in four cases: 1256 // 1. DDL executed 1257 // 2. pausing / stopping the sync (driven by `s.flushJobs`) 1258 // 3. IsFreshTask return true 1259 // 4. Heartbeat event received 1260 // 1261 // but when error occurred, we can not flush checkpoint, otherwise data may lost 1262 // and except rejecting to flush the checkpoint, we also need to rollback the checkpoint saved before 1263 // this should be handled when `s.Run` returned 1264 // 1265 // we may need to refactor the concurrency model to make the work-flow more clear later. 1266 func (s *Syncer) flushCheckPoints() error { 1267 err := s.execError.Load() 1268 // TODO: for now, if any error occurred (including user canceled), checkpoint won't be updated. But if we have put 1269 // optimistic shard info, DM-master may resolved the optimistic lock and let other worker execute DDL. So after this 1270 // worker resume, it can not execute the DML/DDL in old binlog because of downstream table structure mismatching. 1271 // We should find a way to (compensating) implement a transaction containing interaction with both etcd and SQL. 1272 if err != nil && (terror.ErrDBExecuteFailed.Equal(err) || terror.ErrDBUnExpect.Equal(err)) { 1273 s.tctx.L().Warn("error detected when executing SQL job, skip sync flush checkpoints", 1274 zap.Stringer("checkpoint", s.checkpoint), 1275 zap.Error(err)) 1276 return nil 1277 } 1278 1279 snapshotInfo, exceptTables, shardMetaSQLs, shardMetaArgs := s.createCheckpointSnapshot(true) 1280 1281 if snapshotInfo == nil { 1282 s.tctx.L().Debug("checkpoint has no change, skip sync flush checkpoint") 1283 return nil 1284 } 1285 1286 syncFlushErrCh := make(chan error, 1) 1287 1288 task := &checkpointFlushTask{ 1289 snapshotInfo: snapshotInfo, 1290 exceptTables: exceptTables, 1291 shardMetaSQLs: shardMetaSQLs, 1292 shardMetaArgs: shardMetaArgs, 1293 asyncflushJob: nil, 1294 syncFlushErrCh: syncFlushErrCh, 1295 } 1296 s.checkpointFlushWorker.Add(task) 1297 1298 return <-syncFlushErrCh 1299 } 1300 1301 // flushCheckPointsAsync asynchronous flushes checkpoint. 
1302 func (s *Syncer) flushCheckPointsAsync(asyncFlushJob *job) { 1303 err := s.execError.Load() 1304 // TODO: for now, if any error occurred (including user canceled), checkpoint won't be updated. But if we have put 1305 // optimistic shard info, DM-master may resolved the optimistic lock and let other worker execute DDL. So after this 1306 // worker resume, it can not execute the DML/DDL in old binlog because of downstream table structure mismatching. 1307 // We should find a way to (compensating) implement a transaction containing interaction with both etcd and SQL. 1308 if err != nil && (terror.ErrDBExecuteFailed.Equal(err) || terror.ErrDBUnExpect.Equal(err)) { 1309 s.tctx.L().Warn("error detected when executing SQL job, skip async flush checkpoints", 1310 zap.Stringer("checkpoint", s.checkpoint), 1311 zap.Error(err)) 1312 return 1313 } 1314 1315 snapshotInfo, exceptTables, shardMetaSQLs, shardMetaArgs := s.createCheckpointSnapshot(false) 1316 1317 if snapshotInfo == nil { 1318 s.tctx.L().Debug("checkpoint has no change, skip async flush checkpoint", zap.Int64("job seq", asyncFlushJob.flushSeq)) 1319 return 1320 } 1321 1322 task := &checkpointFlushTask{ 1323 snapshotInfo: snapshotInfo, 1324 exceptTables: exceptTables, 1325 shardMetaSQLs: shardMetaSQLs, 1326 shardMetaArgs: shardMetaArgs, 1327 asyncflushJob: asyncFlushJob, 1328 syncFlushErrCh: nil, 1329 } 1330 s.checkpointFlushWorker.Add(task) 1331 } 1332 1333 func (s *Syncer) createCheckpointSnapshot(isSyncFlush bool) (*SnapshotInfo, []*filter.Table, []string, [][]interface{}) { 1334 snapshotInfo := s.checkpoint.Snapshot(isSyncFlush) 1335 if snapshotInfo == nil { 1336 return nil, nil, nil, nil 1337 } 1338 1339 var ( 1340 exceptTableIDs map[string]bool 1341 exceptTables []*filter.Table 1342 shardMetaSQLs []string 1343 shardMetaArgs [][]interface{} 1344 ) 1345 if s.cfg.ShardMode == config.ShardPessimistic { 1346 // flush all checkpoints except tables which are unresolved for sharding DDL for the pessimistic mode. 1347 // NOTE: for the optimistic mode, because we don't handle conflicts automatically (or no re-direct supported), 1348 // so we can simply flush checkpoint for all tables now, and after re-direct supported this should be updated. 1349 exceptTableIDs, exceptTables = s.sgk.UnresolvedTables() 1350 s.tctx.L().Info("flush checkpoints except for these tables", zap.Reflect("tables", exceptTables)) 1351 1352 shardMetaSQLs, shardMetaArgs = s.sgk.PrepareFlushSQLs(exceptTableIDs) 1353 s.tctx.L().Info("prepare flush sqls", zap.Strings("shard meta sqls", shardMetaSQLs), zap.Reflect("shard meta arguments", shardMetaArgs)) 1354 } 1355 1356 return snapshotInfo, exceptTables, shardMetaSQLs, shardMetaArgs 1357 } 1358 1359 func (s *Syncer) afterFlushCheckpoint(task *checkpointFlushTask) error { 1360 // add a gc job to let causality module gc outdated kvs. 
1361 if task.asyncflushJob != nil { 1362 s.tctx.L().Info("after async flushed checkpoint, gc stale causality keys", zap.Int64("flush job seq", task.asyncflushJob.flushSeq)) 1363 s.addJob(newGCJob(task.asyncflushJob.flushSeq)) 1364 } else { 1365 s.tctx.L().Info("after sync flushed checkpoint, gc all causality keys") 1366 s.addJob(newGCJob(math.MaxInt64)) 1367 } 1368 1369 // update current active relay log after checkpoint flushed 1370 err := s.updateActiveRelayLog(task.snapshotInfo.globalPos.Position) 1371 if err != nil { 1372 return err 1373 } 1374 1375 now := time.Now() 1376 if !s.lastCheckpointFlushedTime.IsZero() { 1377 duration := now.Sub(s.lastCheckpointFlushedTime).Seconds() 1378 s.metricsProxies.Metrics.FlushCheckPointsTimeInterval.Observe(duration) 1379 } 1380 s.lastCheckpointFlushedTime = now 1381 1382 s.logAndClearFilteredStatistics() 1383 1384 if s.cliArgs != nil && s.cliArgs.StartTime != "" && s.cli != nil { 1385 clone := *s.cliArgs 1386 clone.StartTime = "" 1387 err2 := ha.PutTaskCliArgs(s.cli, s.cfg.Name, []string{s.cfg.SourceID}, clone) 1388 if err2 != nil { 1389 s.tctx.L().Error("failed to clean start-time in task cli args", zap.Error(err2)) 1390 } else { 1391 s.Lock() 1392 s.cliArgs.StartTime = "" 1393 s.Unlock() 1394 } 1395 } 1396 return nil 1397 } 1398 1399 func (s *Syncer) logAndClearFilteredStatistics() { 1400 filteredInsert := s.filteredInsert.Swap(0) 1401 filteredUpdate := s.filteredUpdate.Swap(0) 1402 filteredDelete := s.filteredDelete.Swap(0) 1403 if filteredInsert > 0 || filteredUpdate > 0 || filteredDelete > 0 { 1404 s.tctx.L().Info("after last flushing checkpoint, DM has ignored row changes by expression filter", 1405 zap.Int64("number of filtered insert", filteredInsert), 1406 zap.Int64("number of filtered update", filteredUpdate), 1407 zap.Int64("number of filtered delete", filteredDelete)) 1408 } 1409 } 1410 1411 // DDL synced one by one, so we only need to process one DDL at a time. 1412 func (s *Syncer) syncDDL(queueBucket string, db *dbconn.DBConn, ddlJobChan chan *job) { 1413 defer s.runWg.Done() 1414 1415 var err error 1416 for { 1417 ddlJob, ok := <-ddlJobChan 1418 if !ok { 1419 return 1420 } 1421 1422 // add this ddl ts beacause we start to exec this ddl. 
1423 s.updateReplicationJobTS(ddlJob, ddlJobIdx) 1424 1425 failpoint.Inject("BlockDDLJob", func(v failpoint.Value) { 1426 t := v.(int) // sleep time 1427 s.tctx.L().Info("BlockDDLJob", zap.Any("job", ddlJob), zap.Int("sleep time", t)) 1428 time.Sleep(time.Second * time.Duration(t)) 1429 }) 1430 1431 var ( 1432 ignore = false 1433 shardPessimistOp *pessimism.Operation 1434 ) 1435 switch s.cfg.ShardMode { 1436 case config.ShardPessimistic: 1437 shardPessimistOp = s.pessimist.PendingOperation() 1438 if shardPessimistOp != nil && !shardPessimistOp.Exec { 1439 ignore = true 1440 s.tctx.L().Info("ignore shard DDLs in pessimistic shard mode", zap.Strings("ddls", ddlJob.ddls)) 1441 } 1442 case config.ShardOptimistic: 1443 if len(ddlJob.ddls) == 0 { 1444 ignore = true 1445 s.tctx.L().Info("ignore shard DDLs in optimistic mode", zap.Stringer("info", s.optimist.PendingInfo())) 1446 } 1447 } 1448 1449 failpoint.Inject("ExecDDLError", func() { 1450 s.tctx.L().Warn("execute ddl error", zap.Strings("DDL", ddlJob.ddls), zap.String("failpoint", "ExecDDLError")) 1451 err = terror.ErrDBUnExpect.Delegate(errors.Errorf("execute ddl %v error", ddlJob.ddls)) 1452 failpoint.Goto("bypass") 1453 }) 1454 1455 if !ignore { 1456 failpoint.Inject("SkipSaveGlobalPoint", func() { 1457 s.tctx.L().Info("skip save global point", zap.String("failpoint", "SkipSaveGlobalPoint")) 1458 panic("SkipSaveGlobalPoint") 1459 }) 1460 // set timezone 1461 if ddlJob.timezone != "" { 1462 s.timezoneLastTime = ddlJob.timezone 1463 setTimezoneSQL := fmt.Sprintf("SET SESSION TIME_ZONE = '%s'", ddlJob.timezone) 1464 ddlJob.ddls = append([]string{setTimezoneSQL}, ddlJob.ddls...) 1465 setTimezoneSQLDefault := "SET SESSION TIME_ZONE = DEFAULT" 1466 ddlJob.ddls = append(ddlJob.ddls, setTimezoneSQLDefault) 1467 } else if s.timezoneLastTime != "" { 1468 // use last time's time zone 1469 setTimezoneSQL := fmt.Sprintf("SET SESSION TIME_ZONE = '%s'", s.timezoneLastTime) 1470 ddlJob.ddls = append([]string{setTimezoneSQL}, ddlJob.ddls...) 1471 setTimezoneSQLDefault := "SET SESSION TIME_ZONE = DEFAULT" 1472 ddlJob.ddls = append(ddlJob.ddls, setTimezoneSQLDefault) 1473 } 1474 // set timestamp 1475 setTimestampSQL := fmt.Sprintf("SET TIMESTAMP = %d", ddlJob.timestamp) 1476 ddlJob.ddls = append([]string{setTimestampSQL}, ddlJob.ddls...) 
1477 setTimestampSQLDefault := "SET TIMESTAMP = DEFAULT" 1478 ddlJob.ddls = append(ddlJob.ddls, setTimestampSQLDefault) 1479 1480 var affected int 1481 var ddlCreateTime int64 = -1 // default when scan failed 1482 row, err2 := db.QuerySQL(s.syncCtx, s.metricsProxies, "SELECT UNIX_TIMESTAMP()") 1483 if err2 != nil { 1484 s.tctx.L().Warn("selecting unix timestamp failed", zap.Error(err2)) 1485 } else { 1486 for row.Next() { 1487 err2 = row.Scan(&ddlCreateTime) 1488 if err2 != nil { 1489 s.tctx.L().Warn("getting ddlCreateTime failed", zap.Error(err2)) 1490 } 1491 } 1492 //nolint:sqlclosecheck 1493 _ = row.Close() 1494 _ = row.Err() 1495 } 1496 affected, err = db.ExecuteSQLWithIgnore(s.syncCtx, s.metricsProxies, errorutil.IsIgnorableMySQLDDLError, ddlJob.ddls) 1497 failpoint.Inject("TestHandleSpecialDDLError", func() { 1498 err = mysql2.ErrInvalidConn 1499 // simulate the value of affected along with the injected error due to the adding of SET SQL of timezone and timestamp 1500 if affected == 0 { 1501 affected++ 1502 } 1503 }) 1504 if err != nil { 1505 err = s.handleSpecialDDLError(s.syncCtx, err, ddlJob.ddls, affected, db, ddlCreateTime) 1506 err = terror.WithScope(err, terror.ScopeDownstream) 1507 } 1508 } 1509 failpoint.Label("bypass") 1510 failpoint.Inject("SafeModeExit", func(val failpoint.Value) { 1511 if intVal, ok := val.(int); ok && (intVal == 2 || intVal == 3) { 1512 s.tctx.L().Warn("mock safe mode error", zap.Strings("DDL", ddlJob.ddls), zap.String("failpoint", "SafeModeExit")) 1513 if intVal == 2 { 1514 err = terror.ErrWorkerDDLLockInfoNotFound.Generatef("DDL info not found") 1515 } else { 1516 err = terror.ErrDBExecuteFailed.Delegate(errors.Errorf("execute ddl %v error", ddlJob.ddls)) 1517 } 1518 } 1519 }) 1520 // If downstream has error (which may cause by tracker is more compatible than downstream), we should stop handling 1521 // this job, set `s.execError` to let caller of `addJob` discover error 1522 if err != nil { 1523 s.execError.Store(err) 1524 if !utils.IsContextCanceledError(err) { 1525 err = s.handleEventError(err, ddlJob.startLocation, ddlJob.currentLocation, true, ddlJob.originSQL) 1526 s.runFatalChan <- unit.NewProcessError(err) 1527 } 1528 s.jobWg.Done() 1529 continue 1530 } 1531 1532 switch s.cfg.ShardMode { 1533 case config.ShardPessimistic: 1534 // for sharding DDL syncing, send result back 1535 shardInfo := s.pessimist.PendingInfo() 1536 switch { 1537 case shardInfo == nil: 1538 // no need to do the shard DDL handle for `CREATE DATABASE/TABLE` now. 1539 s.tctx.L().Warn("skip shard DDL handle in pessimistic shard mode", zap.Strings("ddl", ddlJob.ddls)) 1540 case shardPessimistOp == nil: 1541 err = terror.ErrWorkerDDLLockOpNotFound.Generate(shardInfo) 1542 default: 1543 err = s.pessimist.DoneOperationDeleteInfo(*shardPessimistOp, *shardInfo) 1544 } 1545 case config.ShardOptimistic: 1546 shardInfo := s.optimist.PendingInfo() 1547 switch { 1548 case shardInfo == nil: 1549 // no need to do the shard DDL handle for `DROP DATABASE/TABLE` now. 1550 // but for `CREATE DATABASE` and `ALTER DATABASE` we execute it to the downstream directly without `shardInfo`. 1551 if ignore { // actually ignored. 
1552 s.tctx.L().Warn("skip shard DDL handle in optimistic shard mode", zap.Strings("ddl", ddlJob.ddls)) 1553 } 1554 case s.optimist.PendingOperation() == nil: 1555 err = terror.ErrWorkerDDLLockOpNotFound.Generate(shardInfo) 1556 default: 1557 err = s.optimist.DoneOperation(*(s.optimist.PendingOperation())) 1558 } 1559 } 1560 if err != nil { 1561 if s.execError.Load() == nil { 1562 s.execError.Store(err) 1563 } 1564 if !utils.IsContextCanceledError(err) { 1565 err = s.handleEventError(err, ddlJob.startLocation, ddlJob.currentLocation, true, ddlJob.originSQL) 1566 s.runFatalChan <- unit.NewProcessError(err) 1567 } 1568 s.jobWg.Done() 1569 continue 1570 } 1571 s.jobWg.Done() 1572 s.updateJobMetrics(true, queueBucket, ddlJob) 1573 } 1574 } 1575 1576 func (s *Syncer) successFunc(queueID int, statementsCnt int, jobs []*job) { 1577 queueBucket := queueBucketName(queueID) 1578 if len(jobs) > 0 { 1579 // NOTE: we can use the first job of job queue to calculate lag because when this job committed, 1580 // every event before this job's event in this queue has already commit. 1581 // and we can use this job to maintain the oldest binlog event ts among all workers. 1582 j := jobs[0] 1583 switch j.tp { 1584 case ddl: 1585 s.metricsProxies.BinlogEventCost.WithLabelValues(metrics.BinlogEventCostStageDDLExec, s.cfg.Name, s.cfg.WorkerName, s.cfg.SourceID).Observe(time.Since(j.jobAddTime).Seconds()) 1586 case dml: 1587 s.metricsProxies.BinlogEventCost.WithLabelValues(metrics.BinlogEventCostStageDMLExec, s.cfg.Name, s.cfg.WorkerName, s.cfg.SourceID).Observe(time.Since(j.jobAddTime).Seconds()) 1588 // metric only increases by 1 because dm batches sql jobs in a single transaction. 1589 s.metricsProxies.Metrics.FinishedTransactionTotal.Inc() 1590 } 1591 } 1592 1593 for _, sqlJob := range jobs { 1594 s.updateJobMetrics(true, queueBucket, sqlJob) 1595 } 1596 s.updateReplicationJobTS(nil, dmlWorkerJobIdx(queueID)) 1597 s.metricsProxies.ReplicationTransactionBatch.WithLabelValues(s.cfg.WorkerName, s.cfg.Name, s.cfg.SourceID, queueBucket, "statements").Observe(float64(statementsCnt)) 1598 s.metricsProxies.ReplicationTransactionBatch.WithLabelValues(s.cfg.WorkerName, s.cfg.Name, s.cfg.SourceID, queueBucket, "rows").Observe(float64(len(jobs))) 1599 } 1600 1601 func (s *Syncer) fatalFunc(job *job, err error) { 1602 s.execError.Store(err) 1603 if !utils.IsContextCanceledError(err) { 1604 err = s.handleEventError(err, job.startLocation, job.currentLocation, false, "") 1605 s.runFatalChan <- unit.NewProcessError(err) 1606 } 1607 } 1608 1609 // DML synced with causality. 1610 func (s *Syncer) syncDML() { 1611 defer s.runWg.Done() 1612 1613 dmlJobCh := s.dmlJobCh 1614 if s.cfg.Compact { 1615 dmlJobCh = compactorWrap(dmlJobCh, s) 1616 } 1617 causalityCh := causalityWrap(dmlJobCh, s) 1618 flushCh := dmlWorkerWrap(causalityCh, s) 1619 1620 for range flushCh { 1621 s.jobWg.Done() 1622 } 1623 } 1624 1625 func (s *Syncer) waitBeforeRunExit(ctx context.Context) { 1626 defer s.runWg.Done() 1627 failpoint.Inject("checkCheckpointInMiddleOfTransaction", func() { 1628 s.tctx.L().Info("incr maxPauseOrStopWaitTime time ") 1629 defaultMaxPauseOrStopWaitTime = time.Minute * 10 1630 }) 1631 select { 1632 case <-ctx.Done(): // hijack the root context from s.Run to wait for the transaction to end. 
1633 s.tctx.L().Info("received subtask's done, try graceful stop") 1634 needToExitTime := time.Now() 1635 s.waitTransactionLock.Lock() 1636 1637 failpoint.Inject("checkWaitDuration", func(_ failpoint.Value) { 1638 s.isTransactionEnd = false 1639 }) 1640 if s.isTransactionEnd { 1641 s.waitXIDJob.Store(int64(waitComplete)) 1642 s.waitTransactionLock.Unlock() 1643 s.tctx.L().Info("the last job is transaction end, done directly") 1644 s.runCancel() 1645 return 1646 } 1647 s.waitXIDJob.Store(int64(waiting)) 1648 s.waitTransactionLock.Unlock() 1649 s.refreshCliArgs() 1650 1651 waitDuration := defaultMaxPauseOrStopWaitTime 1652 if s.cliArgs != nil && s.cliArgs.WaitTimeOnStop != "" { 1653 waitDuration, _ = time.ParseDuration(s.cliArgs.WaitTimeOnStop) 1654 } 1655 prepareForWaitTime := time.Since(needToExitTime) 1656 failpoint.Inject("recordAndIgnorePrepareTime", func() { 1657 prepareForWaitTime = time.Duration(0) 1658 }) 1659 waitDuration -= prepareForWaitTime 1660 failpoint.Inject("recordAndIgnorePrepareTime", func() { 1661 waitBeforeRunExitDurationForTest = waitDuration 1662 }) 1663 if waitDuration.Seconds() <= 0 { 1664 s.tctx.L().Info("wait transaction end timeout, exit now") 1665 s.runCancel() 1666 return 1667 } 1668 failpoint.Inject("checkWaitDuration", func(val failpoint.Value) { 1669 if testDuration, testError := time.ParseDuration(val.(string)); testError == nil { 1670 if testDuration.Seconds() == waitDuration.Seconds() { 1671 panic("success check wait_time_on_stop !!!") 1672 } else { 1673 s.tctx.L().Error("checkWaitDuration fail", zap.Duration("testDuration", testDuration), zap.Duration("waitDuration", waitDuration)) 1674 } 1675 } else { 1676 s.tctx.L().Error("checkWaitDuration error", zap.Error(testError)) 1677 } 1678 }) 1679 1680 select { 1681 case <-s.runCtx.Ctx.Done(): 1682 s.tctx.L().Info("syncer run exit so runCtx done") 1683 case <-time.After(waitDuration): 1684 s.tctx.L().Info("wait transaction end timeout, exit now") 1685 s.runCancel() 1686 } 1687 case <-s.runCtx.Ctx.Done(): // when no graceful stop, run ctx will canceled first. 1688 s.tctx.L().Info("received ungraceful exit ctx, exit now") 1689 } 1690 } 1691 1692 func (s *Syncer) updateTSOffsetCronJob(ctx context.Context) { 1693 defer s.runWg.Done() 1694 // temporarily hard code there. if this Proxies works well add this to config file. 1695 ticker := time.NewTicker(time.Minute * 10) 1696 defer ticker.Stop() 1697 for { 1698 select { 1699 case <-ticker.C: 1700 if utErr := s.updateTSOffset(ctx); utErr != nil { 1701 s.tctx.L().Error("get server unix ts err", zap.Error(utErr)) 1702 } 1703 case <-ctx.Done(): 1704 return 1705 } 1706 } 1707 } 1708 1709 func (s *Syncer) updateLagCronJob(ctx context.Context) { 1710 defer s.runWg.Done() 1711 // temporarily hard code there. if this Proxies works well add this to config file. 1712 ticker := time.NewTicker(time.Millisecond * 100) 1713 defer ticker.Stop() 1714 for { 1715 select { 1716 case <-ticker.C: 1717 s.updateReplicationLagMetric() 1718 case <-ctx.Done(): 1719 return 1720 } 1721 } 1722 } 1723 1724 func (s *Syncer) updateTSOffset(ctx context.Context) error { 1725 t1 := time.Now() 1726 ts, tsErr := s.fromDB.GetServerUnixTS(ctx) 1727 rtt := time.Since(t1).Seconds() 1728 if tsErr == nil { 1729 s.tsOffset.Store(time.Now().Unix() - ts - int64(rtt/2)) 1730 } 1731 return tsErr 1732 } 1733 1734 // Run starts running for sync, we should guarantee it can rerun when paused. 
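// It loads or flushes checkpoints as needed, initializes the schema tracker, starts the DML/DDL workers,
// the checkpoint flush worker and the lag/TS-offset cron jobs, and then enters the binlog event loop until
// the subtask is stopped or a fatal error occurs.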
1735 func (s *Syncer) Run(ctx context.Context) (err error) { 1736 runCtx, runCancel := context.WithCancel(context.Background()) 1737 s.runCtx, s.runCancel = tcontext.NewContext(runCtx, s.tctx.L()), runCancel 1738 syncCtx, syncCancel := context.WithCancel(context.Background()) 1739 s.syncCtx, s.syncCancel = tcontext.NewContext(syncCtx, s.tctx.L()), syncCancel 1740 defer func() { 1741 s.runCancel() 1742 s.closeJobChans() 1743 s.checkpointFlushWorker.Close() 1744 s.runWg.Wait() 1745 // s.syncCancel won't be called when normal exit, this call just to follow the best practice of use context. 1746 s.syncCancel() 1747 }() 1748 1749 // we should start this goroutine as soon as possible, because it's the only goroutine that cancel syncer.Run 1750 s.runWg.Add(1) 1751 go func() { 1752 s.waitBeforeRunExit(ctx) 1753 }() 1754 1755 // before sync run, we get the ts offset from upstream first 1756 if utErr := s.updateTSOffset(ctx); utErr != nil { 1757 return utErr 1758 } 1759 1760 // some initialization that can't be put in Syncer.Init 1761 fresh, err := s.IsFreshTask(s.runCtx.Ctx) 1762 if err != nil { 1763 return err 1764 } 1765 1766 // task command line arguments have the highest priority 1767 skipLoadMeta := false 1768 s.refreshCliArgs() 1769 if s.cliArgs != nil && s.cliArgs.StartTime != "" { 1770 err = s.setGlobalPointByTime(s.runCtx, s.cliArgs.StartTime) 1771 if terror.ErrConfigStartTimeTooLate.Equal(err) { 1772 return err 1773 } 1774 skipLoadMeta = err == nil 1775 } 1776 1777 // some initialization that can't be put in Syncer.Init 1778 if fresh && !skipLoadMeta { 1779 // for fresh task, we try to load checkpoints from meta (file or config item) 1780 err = s.checkpoint.LoadMeta(runCtx) 1781 if err != nil { 1782 return err 1783 } 1784 } 1785 1786 var ( 1787 flushCheckpoint bool 1788 delLoadTask bool 1789 cleanDumpFile = s.cfg.CleanDumpFile 1790 freshAndAllMode bool 1791 ) 1792 flushCheckpoint, err = s.adjustGlobalPointGTID(s.runCtx) 1793 if err != nil { 1794 return err 1795 } 1796 if fresh && config.HasLoad(s.cfg.Mode) { 1797 delLoadTask = true 1798 flushCheckpoint = true 1799 freshAndAllMode = true 1800 } 1801 1802 if s.cfg.Mode == config.ModeIncrement || !fresh { 1803 cleanDumpFile = false 1804 } 1805 1806 s.runWg.Add(1) 1807 go s.syncDML() 1808 s.runWg.Add(1) 1809 go func() { 1810 defer s.runWg.Done() 1811 // also need to use a different ctx. checkpointFlushWorker worker will be closed in the first defer 1812 s.checkpointFlushWorker.Run(s.tctx) 1813 }() 1814 s.runWg.Add(1) 1815 go s.syncDDL(adminQueueName, s.ddlDBConn, s.ddlJobCh) 1816 s.runWg.Add(1) 1817 go s.updateLagCronJob(s.runCtx.Ctx) 1818 s.runWg.Add(1) 1819 go s.updateTSOffsetCronJob(s.runCtx.Ctx) 1820 1821 // some prepare work before the binlog event loop: 1822 // 1. first we flush checkpoint as needed, so in next resume we won't go to Load unit. 1823 // 2. then since we are confident that Load unit is done we can delete the load task etcd KV. 1824 // TODO: we can't handle panic between 1. and 2., or fail to delete the load task etcd KV. 1825 // 3. then we initiate schema tracker 1826 // 4. - when it's a fresh task, load the table structure from dump files into schema tracker, 1827 // and flush them into checkpoint again. 1828 // - when it's a resumed task, load the table structure from checkpoints into schema tracker. 1829 // TODO: we can't handle failure between 1. and 4. After 1. it's not a fresh task. 1830 // 5. 
finally clean the dump files 1831 1832 if flushCheckpoint { 1833 if err = s.flushCheckPoints(); err != nil { 1834 s.tctx.L().Warn("fail to flush checkpoints when starting task", zap.Error(err)) 1835 return err 1836 } 1837 } 1838 if delLoadTask { 1839 if err = s.delLoadTask(); err != nil { 1840 s.tctx.L().Error("error when del load task in etcd", zap.Error(err)) 1841 } 1842 } 1843 1844 if s.schemaTracker == nil { 1845 // some test will set their own schemaTracker and skip the syncer.Init 1846 s.schemaTracker = schema.NewTracker() 1847 } 1848 // prevent creating new Tracker on `Run` in order to avoid 1849 // two different Trackers are invoked in the validator and the syncer. 1850 err = s.schemaTracker.Init(ctx, s.cfg.Name, int(s.SourceTableNamesFlavor), s.downstreamTrackConn, s.tctx.L()) 1851 if err != nil { 1852 return terror.ErrSchemaTrackerInit.Delegate(err) 1853 } 1854 1855 if freshAndAllMode { 1856 err = s.loadTableStructureFromDump(ctx) 1857 if err != nil { 1858 s.tctx.L().Warn("error happened when load table structure from dump files", zap.Error(err)) 1859 cleanDumpFile = false 1860 } else { 1861 err = s.flushCheckPoints() 1862 if err != nil { 1863 s.tctx.L().Warn("error happened when flush table structure from dump files", zap.Error(err)) 1864 cleanDumpFile = false 1865 } 1866 } 1867 } else { 1868 err = s.checkpoint.LoadIntoSchemaTracker(ctx, s.schemaTracker) 1869 if err != nil { 1870 return err 1871 } 1872 } 1873 1874 if cleanDumpFile { 1875 s.tctx.L().Info("try to remove all dump files") 1876 if err = storage.RemoveAll(ctx, s.cfg.Dir, nil); err != nil { 1877 s.tctx.L().Warn("error when remove loaded dump folder", zap.String("data folder", s.cfg.Dir), zap.Error(err)) 1878 } 1879 } 1880 1881 failpoint.Inject("AdjustGTIDExit", func() { 1882 s.tctx.L().Warn("exit triggered", zap.String("failpoint", "AdjustGTIDExit")) 1883 s.streamerController.Close() 1884 utils.OsExit(1) 1885 }) 1886 1887 var ( 1888 endLocation = s.checkpoint.GlobalPoint() // also init to global checkpoint 1889 lastTxnEndLocation = s.checkpoint.GlobalPoint() 1890 1891 currentGTID string 1892 ) 1893 s.tctx.L().Info("replicate binlog from checkpoint", zap.Stringer("checkpoint", lastTxnEndLocation)) 1894 1895 if s.streamerController.IsClosed() { 1896 err = s.streamerController.Start(s.runCtx, lastTxnEndLocation) 1897 if err != nil { 1898 return terror.Annotate(err, "fail to restart streamer controller") 1899 } 1900 } 1901 // syncing progress with sharding DDL group 1902 // 1. use the global streamer to sync regular binlog events 1903 // 2. sharding DDL synced for some sharding groups 1904 // * record first pos, last pos, target schema, target table as re-sync info 1905 // 3. use the re-sync info recorded in step.2 to create a new streamer 1906 // 4. use the new streamer re-syncing for this sharding group 1907 // 5. in sharding group's re-syncing 1908 // * ignore other tables' binlog events 1909 // * compare last pos with current binlog's pos to determine whether re-sync completed 1910 // 6. 
use the global streamer to continue the syncing 1911 var ( 1912 shardingReSyncCh = make(chan *ShardingReSync, 10) 1913 shardingReSync *ShardingReSync 1914 savedGlobalLastLocation binlog.Location 1915 traceSource = fmt.Sprintf("%s.syncer.%s", s.cfg.SourceID, s.cfg.Name) 1916 lastEvent *replication.BinlogEvent 1917 ) 1918 1919 // this is second defer func in syncer.Run so in this time checkpointFlushWorker are still running 1920 defer func() { 1921 if err1 := recover(); err1 != nil { 1922 failpoint.Inject("ExitAfterSaveOnlineDDL", func() { 1923 s.tctx.L().Info("force panic") 1924 panic("ExitAfterSaveOnlineDDL") 1925 }) 1926 s.tctx.L().Error("panic log", zap.Reflect("error message", err1), zap.Stack("stack")) 1927 err = terror.ErrSyncerUnitPanic.Generate(err1) 1928 } 1929 1930 var ( 1931 err2 error 1932 exitSafeModeLoc = endLocation 1933 ) 1934 saveExitSafeModeLoc := func(loc binlog.Location) { 1935 if binlog.CompareLocation(loc, exitSafeModeLoc, s.cfg.EnableGTID) > 0 { 1936 exitSafeModeLoc = loc 1937 } 1938 } 1939 saveExitSafeModeLoc(savedGlobalLastLocation) 1940 if s.cfg.ShardMode == config.ShardOptimistic { 1941 if shardingReSync != nil { 1942 saveExitSafeModeLoc(shardingReSync.latestLocation) 1943 } 1944 for _, latestLoc := range s.osgk.getShardingResyncs() { 1945 saveExitSafeModeLoc(latestLoc) 1946 } 1947 } 1948 s.checkpoint.SaveSafeModeExitPoint(&exitSafeModeLoc) 1949 1950 // flush all jobs before exit 1951 if err2 = s.flushJobs(); err2 != nil { 1952 s.tctx.L().Warn("failed to flush jobs when exit task", zap.Error(err2)) 1953 } 1954 1955 // if any execute error, flush safemode exit point 1956 if err2 = s.execError.Load(); err2 != nil && (terror.ErrDBExecuteFailed.Equal(err2) || terror.ErrDBUnExpect.Equal(err2)) { 1957 if err2 = s.checkpoint.FlushSafeModeExitPoint(s.tctx); err2 != nil { 1958 s.tctx.L().Warn("failed to flush safe mode checkpoints when exit task", zap.Error(err2)) 1959 } 1960 } 1961 }() 1962 1963 now := time.Now() 1964 s.start.Store(now) 1965 s.lastTime.Store(now) 1966 1967 s.initInitExecutedLoc() 1968 s.running.Store(true) 1969 defer s.running.Store(false) 1970 1971 // safeMode makes syncer reentrant. 1972 // we make each operator reentrant to make syncer reentrant. 1973 // `replace` and `delete` are naturally reentrant. 1974 // use `delete`+`replace` to represent `update` can make `update` reentrant. 1975 // but there are no ways to make `update` idempotent, 1976 // if we start syncer at an early position, database must bear a period of inconsistent state, 1977 // it's eventual consistency. 
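// For example, with safe mode on, an upstream INSERT is replayed downstream as REPLACE and an UPDATE as
// DELETE+REPLACE, so re-applying the same binlog range after a restart cannot raise duplicate-key errors,
// at the cost of temporarily exposing intermediate states downstream.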
1978 s.enableSafeModeInitializationPhase(s.runCtx) 1979 1980 closeShardingResync := func() error { 1981 if shardingReSync == nil { 1982 return nil 1983 } 1984 1985 var ( 1986 err2 error 1987 nextLocation = savedGlobalLastLocation 1988 ) 1989 1990 // if remaining DDLs in sequence, redirect global stream to the next sharding DDL position 1991 if !shardingReSync.allResolved { 1992 nextLocation, err2 = s.sgk.ActiveDDLFirstLocation(shardingReSync.targetTable) 1993 if err2 != nil { 1994 return err2 1995 } 1996 } 1997 1998 err3 := s.streamerController.ResetReplicationSyncer(s.tctx, nextLocation) 1999 if err3 != nil { 2000 return err3 2001 } 2002 2003 shardingReSync = nil 2004 return nil 2005 } 2006 2007 maybeSkipNRowsEvent := func(n int) error { 2008 if n == 0 { 2009 return nil 2010 } 2011 2012 for i := 0; i < n; { 2013 e, _, err1 := s.streamerController.GetEvent(s.runCtx) 2014 if err1 != nil { 2015 return err1 2016 } 2017 switch e.Event.(type) { 2018 case *replication.GTIDEvent, *replication.MariadbGTIDEvent: 2019 gtidStr, err2 := event.GetGTIDStr(e) 2020 if err2 != nil { 2021 return err2 2022 } 2023 if currentGTID != gtidStr { 2024 s.tctx.L().Error("after recover GTID-based replication, the first GTID is not the same as the broken one. May meet duplicate entry or corrupt data if table has no PK/UK.", 2025 zap.String("last GTID", currentGTID), 2026 zap.String("GTID after reset", gtidStr), 2027 ) 2028 return nil 2029 } 2030 case *replication.RowsEvent: 2031 i++ 2032 } 2033 } 2034 s.tctx.L().Info("discard event already consumed", zap.Int("count", n), 2035 zap.Any("start location", s.streamerController.GetCurStartLocation()), 2036 zap.Any("end location", s.streamerController.GetCurEndLocation())) 2037 return nil 2038 } 2039 2040 // eventIndex is the rows event index in this transaction, it's used to avoid reading duplicate events in GTID mode 2041 eventIndex := 0 2042 // affectedSourceTables is used for gtid mode to update table point's gtid set after receiving an xid event, 2043 // which means this whole transaction is finished 2044 affectedSourceTables := make(map[string]map[string]struct{}, 0) 2045 // the relay log file may be truncated (not ending with a RotateEvent), in this situation, we may read some rows events 2046 // and then read from the gtid again, so we force enter safe-mode for one more transaction to avoid failure due to 2047 // conflict 2048 2049 // binlog event loop 2050 for { 2051 if s.execError.Load() != nil { 2052 return nil 2053 } 2054 s.currentLocationMu.Lock() 2055 s.currentLocationMu.currentLocation = endLocation 2056 s.currentLocationMu.Unlock() 2057 2058 failpoint.Inject("FakeRedirect", func(val failpoint.Value) { 2059 if len(shardingReSyncCh) == 0 && shardingReSync == nil { 2060 if strVal, ok := val.(string); ok { 2061 pos, gtidSet, err2 := s.fromDB.GetMasterStatus(s.tctx.WithContext(ctx), s.cfg.Flavor) 2062 if err2 != nil { 2063 s.tctx.L().Error("fail to get master status in failpoint FakeRedirect", zap.Error(err2)) 2064 os.Exit(1) 2065 } 2066 loc := binlog.NewLocation(pos, gtidSet) 2067 s.tctx.L().Info("fake redirect", zap.Stringer("endLocation", endLocation), zap.Stringer("latestLocation", loc)) 2068 resync := &ShardingReSync{ 2069 currLocation: endLocation.Clone(), 2070 latestLocation: loc, 2071 targetTable: utils.UnpackTableID(strVal), 2072 allResolved: true, 2073 } 2074 shardingReSyncCh <- resync 2075 } 2076 } 2077 }) 2078 // fetch from sharding resync channel if needed, and redirect global 2079 // stream to current binlog position recorded by ShardingReSync 2080 if
len(shardingReSyncCh) > 0 && shardingReSync == nil { 2081 // some sharding groups need to re-syncing 2082 shardingReSync = <-shardingReSyncCh 2083 s.tctx.L().Debug("starts to handle new shardingResync operation", zap.Stringer("shardingResync", shardingReSync)) 2084 savedGlobalLastLocation = lastTxnEndLocation // save global last location 2085 lastTxnEndLocation = shardingReSync.currLocation 2086 2087 // remove sharding resync global checkpoint location limit from optimistic sharding group keeper 2088 if s.cfg.ShardMode == config.ShardOptimistic { 2089 s.osgk.removeShardingReSync(shardingReSync) 2090 } 2091 2092 err = s.streamerController.ResetReplicationSyncer(s.runCtx, shardingReSync.currLocation) 2093 if err != nil { 2094 return err 2095 } 2096 2097 failpoint.Inject("ReSyncExit", func() { 2098 s.tctx.L().Warn("exit triggered", zap.String("failpoint", "ReSyncExit")) 2099 utils.OsExit(1) 2100 }) 2101 } 2102 // for position mode, we can redirect at any time 2103 // for gtid mode, we can redirect only when current location related gtid's transaction is totally executed and 2104 // next gtid is just updated (because we check if we can end resync by currLoc >= latestLoc) 2105 if s.cfg.ShardMode == config.ShardOptimistic { 2106 canRedirect := true 2107 if s.cfg.EnableGTID { 2108 canRedirect = safeToRedirect(lastEvent) 2109 } else if lastEvent != nil { 2110 if _, ok := lastEvent.Event.(*replication.QueryEvent); ok { 2111 if op := s.optimist.PendingOperation(); op != nil && op.ConflictStage == optimism.ConflictSkipWaitRedirect { 2112 canRedirect = false // don't redirect here at least after a row event 2113 } 2114 } 2115 } 2116 if canRedirect { 2117 op, targetTableID := s.optimist.PendingRedirectOperation() 2118 if op != nil { 2119 // for optimistic sharding mode, if a new sharding group is resolved when syncer is redirecting, 2120 // instead of using the endLocation, the next redirection should share the same latestLocation with the current shardingResync. 2121 // This is to avoid syncer syncs to current shardingResync.latestLocation before, 2122 // we may miss some rows events if we don't check the row events between endLocation and shardingResync.latestLocation. 2123 // TODO: This will cause a potential performance issue. If we have multiple tables not resolved after a huge amount of binlogs but resolved in a short time, 2124 // current implementation will cause syncer to redirect and replay the binlogs in this segment several times. One possible solution is to 2125 // interrupt current resync once syncer meets a new redirect operation, force other tables to be resolved together in the interrupted shardingResync. 
2126 // If we want to do this, we also need to remove the target table check at https://github.com/pingcap/tiflow/blob/af849add84bf26feb2628d3e1e4344830b915fd9/dm/syncer/syncer.go#L2489 2127 endLocation := &endLocation 2128 if shardingReSync != nil { 2129 endLocation = &shardingReSync.latestLocation 2130 } 2131 resolved := s.resolveOptimisticDDL(&eventContext{ 2132 shardingReSyncCh: &shardingReSyncCh, 2133 endLocation: *endLocation, 2134 }, &filter.Table{Schema: op.UpSchema, Name: op.UpTable}, utils.UnpackTableID(targetTableID)) 2135 // if resolved and targetTableID == shardingResync.TargetTableID, we should resolve this resync before we continue 2136 if resolved && shardingReSync != nil && targetTableID == shardingReSync.targetTable.String() { 2137 err = closeShardingResync() 2138 if err != nil { 2139 return err 2140 } 2141 } 2142 continue 2143 } 2144 } 2145 } 2146 2147 startTime := time.Now() 2148 e, op, err := s.streamerController.GetEvent(s.runCtx) 2149 2150 failpoint.Inject("SafeModeExit", func(val failpoint.Value) { 2151 if intVal, ok := val.(int); ok && intVal == 1 { 2152 s.tctx.L().Warn("fail to get event", zap.String("failpoint", "SafeModeExit")) 2153 err = errors.New("connect: connection refused") 2154 } 2155 }) 2156 failpoint.Inject("GetEventErrorInTxn", func(val failpoint.Value) { 2157 if intVal, ok := val.(int); ok && intVal == eventIndex && failOnceForTest.CAS(false, true) { 2158 err = errors.New("failpoint triggered") 2159 s.tctx.L().Warn("failed to get event", zap.Int("event_index", eventIndex), 2160 zap.Any("cur_pos", endLocation), zap.Any("las_pos", lastTxnEndLocation), 2161 zap.Any("pos", e.Header.LogPos), log.ShortError(err)) 2162 } 2163 }) 2164 failpoint.Inject("SleepInTxn", func(val failpoint.Value) { 2165 if intVal, ok := val.(int); ok && intVal == eventIndex && failOnceForTest.CAS(false, true) { 2166 s.tctx.L().Warn("start to sleep 6s and continue for this event", zap.Int("event_index", eventIndex), 2167 zap.Any("cur_pos", endLocation), zap.Any("las_pos", lastTxnEndLocation), 2168 zap.Any("pos", e.Header.LogPos)) 2169 time.Sleep(6 * time.Second) 2170 } 2171 }) 2172 switch { 2173 case err == context.Canceled: 2174 s.tctx.L().Info("binlog replication main routine quit(context canceled)!", zap.Stringer("last location", lastTxnEndLocation)) 2175 return nil 2176 case err == context.DeadlineExceeded: 2177 s.tctx.L().Info("deadline exceeded when fetching binlog event") 2178 continue 2179 // TODO: if we can maintain the lastTxnEndLocation inside streamerController, no need to expose this logic to syncer 2180 case isDuplicateServerIDError(err): 2181 // if the server id is already used, need to use a new server id 2182 s.tctx.L().Info("server id is already used by another slave, will change to a new server id and get event again") 2183 err1 := s.streamerController.UpdateServerIDAndResetReplication(s.tctx, lastTxnEndLocation) 2184 if err1 != nil { 2185 return err1 2186 } 2187 continue 2188 case err == relay.ErrorMaybeDuplicateEvent: 2189 s.tctx.L().Warn("read binlog met a truncated file, will skip events that has been consumed") 2190 err = maybeSkipNRowsEvent(eventIndex) 2191 if err == nil { 2192 continue 2193 } 2194 s.tctx.L().Warn("skip duplicate rows event failed", zap.Error(err)) 2195 } 2196 2197 if err != nil { 2198 s.tctx.L().Error("fail to fetch binlog", log.ShortError(err)) 2199 2200 if isConnectionRefusedError(err) { 2201 return err 2202 } 2203 2204 // TODO: if we can maintain the lastTxnEndLocation inside streamerController, no need to expose this logic to syncer 
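// For retryable streamer errors, replication is reset to the last flushed global checkpoint and
// maybeSkipNRowsEvent drops the rows events of the interrupted transaction that were already consumed,
// so they are not applied twice in GTID mode.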
2205 if s.streamerController.CanRetry(err) { 2206 // GlobalPoint is the last finished transaction location 2207 err = s.streamerController.ResetReplicationSyncer(s.tctx, s.checkpoint.GlobalPoint()) 2208 if err != nil { 2209 return err 2210 } 2211 s.tctx.L().Info("reset replication binlog puller", zap.Any("pos", s.checkpoint.GlobalPoint())) 2212 if err = maybeSkipNRowsEvent(eventIndex); err != nil { 2213 return err 2214 } 2215 continue 2216 } 2217 2218 return terror.ErrSyncerGetEvent.Generate(err) 2219 } 2220 2221 failpoint.Inject("IgnoreSomeTypeEvent", func(val failpoint.Value) { 2222 if e.Header.EventType.String() == val.(string) { 2223 s.tctx.L().Debug("IgnoreSomeTypeEvent", zap.Reflect("event", e)) 2224 failpoint.Continue() 2225 } 2226 }) 2227 2228 // time duration for reading an event from relay log or upstream master. 2229 s.metricsProxies.Metrics.BinlogReadDurationHistogram.Observe(time.Since(startTime).Seconds()) 2230 startTime = time.Now() // reset start time for the next metric. 2231 2232 s.metricsProxies.Metrics.BinlogSyncerPosGauge.Set(float64(e.Header.LogPos)) 2233 index, err := utils.GetFilenameIndex(lastTxnEndLocation.Position.Name) 2234 if err != nil { 2235 s.tctx.L().Warn("fail to get index number of binlog file, may because only specify GTID and hasn't saved according binlog position", log.ShortError(err)) 2236 } else { 2237 s.metricsProxies.Metrics.BinlogSyncerFileGauge.Set(float64(index)) 2238 } 2239 s.binlogSizeCount.Add(int64(e.Header.EventSize)) 2240 s.metricsProxies.Metrics.BinlogEventSizeHistogram.Observe(float64(e.Header.EventSize)) 2241 2242 failpoint.Inject("ProcessBinlogSlowDown", nil) 2243 2244 s.tctx.L().Debug("receive binlog event", zap.Reflect("header", e.Header)) 2245 2246 startLocation := s.streamerController.GetCurStartLocation() 2247 endLocation = s.streamerController.GetCurEndLocation() 2248 lastTxnEndLocation = s.streamerController.GetTxnEndLocation() 2249 2250 if _, ok := e.Event.(*replication.GenericEvent); !ok { 2251 lastEvent = e 2252 } 2253 2254 switch op { 2255 case pb.ErrorOp_Skip: 2256 // try to handle pessimistic sharding? 
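// handle-error skip: only a DDL (query) event can be skipped here; the skipped DDL is still tracked in the
// schema tracker so later events see a consistent table structure, then the global/table checkpoints are
// saved past this event and flushed so the statement is not replayed after resume.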
2257 queryEvent, ok := e.Event.(*replication.QueryEvent) 2258 if !ok { 2259 s.tctx.L().Warn("can't skip an event which is not DDL", zap.Reflect("header", e.Header)) 2260 break 2261 } 2262 ec := eventContext{ 2263 tctx: s.tctx, 2264 header: e.Header, 2265 startLocation: startLocation, 2266 endLocation: endLocation, 2267 lastLocation: lastTxnEndLocation, 2268 } 2269 var sourceTbls map[string]map[string]struct{} 2270 sourceTbls, err = s.trackOriginDDL(queryEvent, ec) 2271 if err != nil { 2272 s.tctx.L().Warn("failed to track query when handle-error skip", zap.Error(err), zap.ByteString("sql", queryEvent.Query)) 2273 } 2274 2275 s.saveGlobalPoint(endLocation) 2276 for sourceSchema, tableMap := range sourceTbls { 2277 if sourceSchema == "" { 2278 continue 2279 } 2280 for sourceTable := range tableMap { 2281 s.saveTablePoint(&filter.Table{Schema: sourceSchema, Name: sourceTable}, endLocation) 2282 } 2283 } 2284 err = s.flushJobs() 2285 if err != nil { 2286 s.tctx.L().Warn("failed to flush jobs when handle-error skip", zap.Error(err)) 2287 } else { 2288 s.tctx.L().Info("flush jobs when handle-error skip") 2289 } 2290 continue 2291 } 2292 2293 // check pass SafeModeExitLoc and try to disable safe mode, but not in sharding or replacing error 2294 safeModeExitLoc := s.checkpoint.SafeModeExitPoint() 2295 if safeModeExitLoc != nil && shardingReSync == nil { 2296 if binlog.CompareLocation(endLocation, *safeModeExitLoc, s.cfg.EnableGTID) > 0 { 2297 s.checkpoint.SaveSafeModeExitPoint(nil) 2298 // must flush here to avoid the following situation: 2299 // 1. quit safe mode 2300 // 2. push forward and replicate some sqls after safeModeExitPoint to downstream 2301 // 3. quit because of network error, fail to flush global checkpoint and new safeModeExitPoint to downstream 2302 // 4. restart again, quit safe mode at safeModeExitPoint, but some sqls after this location have already been replicated to the downstream 2303 if err = s.checkpoint.FlushSafeModeExitPoint(s.runCtx); err != nil { 2304 return err 2305 } 2306 s.tctx.L().Info("disable safe mode in exit point") 2307 if err = s.safeMode.Add(s.runCtx, -1); err != nil { 2308 return err 2309 } 2310 } 2311 } 2312 2313 // set exitSafeModeTS when meet first binlog 2314 if s.firstMeetBinlogTS == nil && s.cliArgs != nil && s.cliArgs.SafeModeDuration != "" && int64(e.Header.Timestamp) != 0 && e.Header.EventType != replication.FORMAT_DESCRIPTION_EVENT { 2315 if checkErr := s.initSafeModeExitTS(int64(e.Header.Timestamp)); checkErr != nil { 2316 return checkErr 2317 } 2318 } 2319 // check if need to exit safe mode by binlog header TS 2320 if s.exitSafeModeTS != nil && shardingReSync == nil { 2321 if checkErr := s.checkAndExitSafeModeByBinlogTS(s.runCtx, int64(e.Header.Timestamp)); checkErr != nil { 2322 return checkErr 2323 } 2324 } 2325 ec := eventContext{ 2326 tctx: s.runCtx, 2327 header: e.Header, 2328 startLocation: startLocation, 2329 endLocation: endLocation, 2330 lastLocation: lastTxnEndLocation, 2331 shardingReSync: shardingReSync, 2332 closeShardingResync: closeShardingResync, 2333 traceSource: traceSource, 2334 safeMode: s.safeMode.Enable(), 2335 startTime: startTime, 2336 shardingReSyncCh: &shardingReSyncCh, 2337 } 2338 2339 var originSQL string // show origin sql when error, only ddl now 2340 var err2 error 2341 var sourceTable *filter.Table 2342 var needContinue bool 2343 var eventType string 2344 2345 funcCommit := func() (bool, error) { 2346 // reset eventIndex and force safeMode flag here. 
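// funcCommit is invoked at a transaction boundary (an XID event or a COMMIT query event): it saves the
// table points collected from this transaction's rows events, finishes a completed sharding re-sync if
// there is one, and enqueues an XID job so the checkpoint can advance to endLocation.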
2347 eventIndex = 0 2348 for schemaName, tableMap := range affectedSourceTables { 2349 for table := range tableMap { 2350 s.saveTablePoint(&filter.Table{Schema: schemaName, Name: table}, endLocation) 2351 } 2352 } 2353 affectedSourceTables = make(map[string]map[string]struct{}) 2354 if shardingReSync != nil { 2355 // TODO: why shardingReSync need to do so, shardingReSync need refactor 2356 shardingReSync.currLocation = endLocation 2357 2358 if binlog.CompareLocation(shardingReSync.currLocation, shardingReSync.latestLocation, s.cfg.EnableGTID) >= 0 { 2359 s.tctx.L().Info("re-replicate shard group was completed", zap.String("event", eventType), zap.Stringer("re-shard", shardingReSync)) 2360 err = closeShardingResync() 2361 if err != nil { 2362 return false, terror.Annotatef(err, "shard group current location %s", shardingReSync.currLocation) 2363 } 2364 return true, nil 2365 } 2366 } 2367 2368 s.tctx.L().Debug("", zap.String("event", eventType), zap.Stringer("last location", lastTxnEndLocation), log.WrapStringerField("location", endLocation)) 2369 2370 job := newXIDJob(endLocation, startLocation, endLocation) 2371 _, err = s.handleJobFunc(job) 2372 return false, err 2373 } 2374 2375 switch ev := e.Event.(type) { 2376 case *replication.RotateEvent: 2377 err2 = s.handleRotateEvent(ev, ec) 2378 case *replication.RowsEvent: 2379 eventIndex++ 2380 s.metricsProxies.Metrics.BinlogEventRowHistogram.Observe(float64(len(ev.Rows))) 2381 sourceTable, err2 = s.handleRowsEvent(ev, ec) 2382 if sourceTable != nil && err2 == nil && s.cfg.EnableGTID { 2383 if _, ok := affectedSourceTables[sourceTable.Schema]; !ok { 2384 affectedSourceTables[sourceTable.Schema] = make(map[string]struct{}) 2385 } 2386 affectedSourceTables[sourceTable.Schema][sourceTable.Name] = struct{}{} 2387 } 2388 case *replication.QueryEvent: 2389 originSQL = strings.TrimSpace(string(ev.Query)) 2390 if originSQL == "COMMIT" { 2391 eventType = "COMMIT query event" 2392 needContinue, err2 = funcCommit() 2393 if needContinue { 2394 continue 2395 } 2396 } else { 2397 err2 = s.ddlWorker.HandleQueryEvent(ev, ec, originSQL) 2398 } 2399 case *replication.XIDEvent: 2400 eventType = "XID" 2401 needContinue, err2 = funcCommit() 2402 if needContinue { 2403 continue 2404 } 2405 case *replication.GenericEvent: 2406 if e.Header.EventType == replication.HEARTBEAT_EVENT { 2407 // flush checkpoint even if there are no real binlog events 2408 if s.checkpoint.LastFlushOutdated() { 2409 s.tctx.L().Info("meet heartbeat event and then flush jobs") 2410 err2 = s.flushJobs() 2411 } 2412 } 2413 case *replication.GTIDEvent, *replication.MariadbGTIDEvent: 2414 currentGTID, err2 = event.GetGTIDStr(e) 2415 if err2 != nil { 2416 return err2 2417 } 2418 case *replication.TransactionPayloadEvent: 2419 for _, tpev := range ev.Events { 2420 switch tpevt := tpev.Event.(type) { 2421 case *replication.RowsEvent: 2422 eventIndex++ 2423 s.metricsProxies.Metrics.BinlogEventRowHistogram.Observe(float64(len(tpevt.Rows))) 2424 ec.header.EventType = tpev.Header.EventType 2425 sourceTable, err2 = s.handleRowsEvent(tpevt, ec) 2426 if sourceTable != nil && err2 == nil && s.cfg.EnableGTID { 2427 if _, ok := affectedSourceTables[sourceTable.Schema]; !ok { 2428 affectedSourceTables[sourceTable.Schema] = make(map[string]struct{}) 2429 } 2430 affectedSourceTables[sourceTable.Schema][sourceTable.Name] = struct{}{} 2431 } 2432 case *replication.QueryEvent: 2433 originSQL = strings.TrimSpace(string(tpevt.Query)) 2434 err2 = s.ddlWorker.HandleQueryEvent(tpevt, ec, originSQL) 2435 case 
*replication.XIDEvent: 2436 eventType = "XID" 2437 needContinue, err2 = funcCommit() 2438 case *replication.TableMapEvent: 2439 case *replication.FormatDescriptionEvent: 2440 default: 2441 s.tctx.L().Warn("unhandled event from transaction payload", zap.String("type", fmt.Sprintf("%T", tpevt))) 2442 } 2443 } 2444 if needContinue { 2445 continue 2446 } 2447 case *replication.TableMapEvent: 2448 case *replication.FormatDescriptionEvent: 2449 default: 2450 s.tctx.L().Warn("unhandled event", zap.String("type", fmt.Sprintf("%T", ev))) 2451 } 2452 if err2 != nil { 2453 if err := s.handleEventError(err2, startLocation, endLocation, e.Header.EventType == replication.QUERY_EVENT, originSQL); err != nil { 2454 return err 2455 } 2456 } 2457 if waitXIDStatus(s.waitXIDJob.Load()) == waitComplete { 2458 // already wait until XID event, we can stop sync now, s.runcancel will be called in defer func 2459 return nil 2460 } 2461 } 2462 } 2463 2464 func (s *Syncer) initSafeModeExitTS(firstBinlogTS int64) error { 2465 // see more in https://github.com/pingcap/tiflow/pull/4601#discussion_r814446628 2466 duration, err := time.ParseDuration(s.cliArgs.SafeModeDuration) 2467 if err != nil { 2468 return err 2469 } 2470 s.firstMeetBinlogTS = &firstBinlogTS 2471 exitTS := firstBinlogTS + int64(duration.Seconds()) 2472 s.exitSafeModeTS = &exitTS 2473 s.tctx.L().Info("safe-mode will disable by task cli args", zap.Int64("ts", exitTS)) 2474 return nil 2475 } 2476 2477 func (s *Syncer) checkAndExitSafeModeByBinlogTS(ctx *tcontext.Context, ts int64) error { 2478 if s.exitSafeModeTS != nil && ts > *s.exitSafeModeTS { 2479 if err := s.safeMode.Add(ctx, -1); err != nil { 2480 return err 2481 } 2482 s.exitSafeModeTS = nil 2483 if !s.safeMode.Enable() { 2484 s.tctx.L().Info("safe-mode disable by task cli args", zap.Int64("ts in binlog", ts)) 2485 } 2486 // delete cliArgs in etcd 2487 clone := *s.cliArgs 2488 clone.SafeModeDuration = "" 2489 if err2 := ha.PutTaskCliArgs(s.cli, s.cfg.Name, []string{s.cfg.SourceID}, clone); err2 != nil { 2490 s.tctx.L().Error("failed to clean safe-mode-duration in task cli args", zap.Error(err2)) 2491 } else { 2492 s.Lock() 2493 s.cliArgs.SafeModeDuration = "" 2494 s.Unlock() 2495 } 2496 } 2497 return nil 2498 } 2499 2500 type eventContext struct { 2501 tctx *tcontext.Context 2502 header *replication.EventHeader 2503 startLocation binlog.Location 2504 endLocation binlog.Location 2505 lastLocation binlog.Location 2506 shardingReSync *ShardingReSync 2507 closeShardingResync func() error 2508 traceSource string 2509 // safeMode is the value of syncer.safeMode when process this event 2510 // syncer.safeMode's value may change on the fly, e.g. after event by pass the safeModeExitPoint 2511 safeMode bool 2512 startTime time.Time 2513 shardingReSyncCh *chan *ShardingReSync 2514 } 2515 2516 // TODO: Further split into smaller functions and group common arguments into a context struct. 
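// handleRotateEvent records the active relay log when a fake rotate event arrives (needed when a task
// starts from GTID without a binlog file name) and, during a sharding re-sync, advances or finishes the
// re-sync according to the rotated location.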
2517 func (s *Syncer) handleRotateEvent(ev *replication.RotateEvent, ec eventContext) error { 2518 failpoint.Inject("MakeFakeRotateEvent", func(val failpoint.Value) { 2519 ec.header.LogPos = 0 2520 ev.NextLogName = []byte(val.(string)) 2521 ec.tctx.L().Info("MakeFakeRotateEvent", zap.String("fake file name", string(ev.NextLogName))) 2522 }) 2523 2524 if utils.IsFakeRotateEvent(ec.header) { 2525 // when user starts a new task with GTID and no binlog file name, we can't know active relay log at init time 2526 // at this case, we update active relay log when receive fake rotate event 2527 if !s.recordedActiveRelayLog { 2528 if err := s.updateActiveRelayLog(mysql.Position{ 2529 Name: string(ev.NextLogName), 2530 Pos: uint32(ev.Position), 2531 }); err != nil { 2532 ec.tctx.L().Warn("failed to update active relay log, will try to update when flush checkpoint", 2533 zap.ByteString("NextLogName", ev.NextLogName), 2534 zap.Uint64("Position", ev.Position), 2535 zap.Error(err)) 2536 } else { 2537 s.recordedActiveRelayLog = true 2538 } 2539 } 2540 } 2541 2542 if ec.shardingReSync != nil { 2543 if binlog.CompareLocation(ec.endLocation, ec.shardingReSync.currLocation, s.cfg.EnableGTID) > 0 { 2544 ec.shardingReSync.currLocation = ec.endLocation 2545 } 2546 if binlog.CompareLocation(ec.shardingReSync.currLocation, ec.shardingReSync.latestLocation, s.cfg.EnableGTID) >= 0 { 2547 ec.tctx.L().Info("re-replicate shard group was completed", zap.String("event", "rotate"), zap.Stringer("re-shard", ec.shardingReSync)) 2548 err := ec.closeShardingResync() 2549 if err != nil { 2550 return err 2551 } 2552 } else { 2553 ec.tctx.L().Debug("re-replicate shard group", zap.String("event", "rotate"), log.WrapStringerField("location", ec.endLocation), zap.Reflect("re-shard", ec.shardingReSync)) 2554 } 2555 return nil 2556 } 2557 2558 ec.tctx.L().Info("", zap.String("event", "rotate"), log.WrapStringerField("location", ec.endLocation)) 2559 return nil 2560 } 2561 2562 // getFlushSeq is used for assigning an auto-incremental sequence number for each flush job. 2563 func (s *Syncer) getFlushSeq() int64 { 2564 s.flushSeq++ 2565 return s.flushSeq 2566 } 2567 2568 // handleRowsEvent returns (affected table, error), affected table means this table's event is committed to handleJob. 2569 func (s *Syncer) handleRowsEvent(ev *replication.RowsEvent, ec eventContext) (*filter.Table, error) { 2570 sourceTable := &filter.Table{ 2571 Schema: string(ev.Table.Schema), 2572 Name: string(ev.Table.Table), 2573 } 2574 targetTable := s.route(sourceTable) 2575 2576 if ec.shardingReSync != nil { 2577 ec.shardingReSync.currLocation = ec.endLocation 2578 // When current progress has passed the latest location in re-sync, we can stop re-sync now. 2579 if binlog.CompareLocation(ec.shardingReSync.currLocation, ec.shardingReSync.latestLocation, s.cfg.EnableGTID) > 0 { 2580 ec.tctx.L().Info("re-replicate shard group was completed", zap.String("event", "row"), zap.Stringer("re-shard", ec.shardingReSync)) 2581 return nil, ec.closeShardingResync() 2582 } 2583 if ec.shardingReSync.targetTable.String() != targetTable.String() { 2584 // in re-syncing, ignore non current sharding group's events 2585 ec.tctx.L().Debug("skip event in re-replicating shard group", zap.String("event", "row"), zap.Stringer("re-shard", ec.shardingReSync)) 2586 return nil, nil 2587 } 2588 } 2589 2590 // For DML position before table checkpoint, ignore it. 
When the position equals to table checkpoint, this event may 2591 // be partially replicated to downstream, we rely on safe-mode to handle it. 2592 if s.checkpoint.IsOlderThanTablePoint(sourceTable, ec.endLocation) { 2593 ec.tctx.L().Debug("ignore obsolete event that is old than table checkpoint", 2594 zap.String("event", "row"), 2595 log.WrapStringerField("location", ec.endLocation), 2596 zap.Stringer("source table", sourceTable)) 2597 return nil, nil 2598 } 2599 2600 ec.tctx.L().Debug("", 2601 zap.String("event", "row"), 2602 zap.Stringer("source table", sourceTable), 2603 zap.Stringer("target table", targetTable), 2604 log.WrapStringerField("location", ec.endLocation), 2605 zap.Reflect("raw event data", ev.Rows)) 2606 2607 needSkip, err := s.skipRowsEvent(sourceTable, ec.header.EventType) 2608 if err != nil { 2609 return nil, err 2610 } 2611 if needSkip { 2612 s.metricsProxies.SkipBinlogDurationHistogram.WithLabelValues("rows", s.cfg.Name, s.cfg.SourceID).Observe(time.Since(ec.startTime).Seconds()) 2613 // for RowsEvent, we should record lastLocation rather than endLocation 2614 return nil, s.recordSkipSQLsLocation(&ec) 2615 } 2616 2617 if (s.cfg.ShardMode == config.ShardPessimistic && s.sgk.InSyncing(sourceTable, targetTable, ec.endLocation)) || 2618 (s.cfg.ShardMode == config.ShardOptimistic && s.osgk.inConflictStage(sourceTable, targetTable)) { 2619 // if in unsync stage and not before active DDL, filter it 2620 // if in sharding re-sync stage and not before active DDL (the next DDL to be synced), filter it 2621 ec.tctx.L().Debug("replicate sharding DDL, filter Rows event", 2622 zap.String("event", "row"), 2623 zap.Stringer("source", sourceTable), 2624 log.WrapStringerField("location", ec.endLocation)) 2625 return nil, nil 2626 } 2627 2628 tableInfo, err := s.getTableInfo(ec.tctx, sourceTable, targetTable) 2629 if err != nil { 2630 return nil, terror.WithScope(err, terror.ScopeDownstream) 2631 } 2632 originRows := ev.Rows 2633 if err2 := checkLogColumns(ev.SkippedColumns); err2 != nil { 2634 return nil, err2 2635 } 2636 2637 extRows := generateExtendColumn(originRows, s.tableRouter, sourceTable, s.cfg.SourceID) 2638 2639 var dmls []*sqlmodel.RowChange 2640 2641 param := &genDMLParam{ 2642 targetTable: targetTable, 2643 originalData: originRows, 2644 sourceTableInfo: tableInfo, 2645 sourceTable: sourceTable, 2646 extendData: extRows, 2647 } 2648 2649 switch ec.header.EventType { 2650 case replication.WRITE_ROWS_EVENTv0, replication.WRITE_ROWS_EVENTv1, replication.WRITE_ROWS_EVENTv2: 2651 exprFilter, err2 := s.exprFilterGroup.GetInsertExprs(sourceTable, tableInfo) 2652 if err2 != nil { 2653 return nil, err2 2654 } 2655 2656 param.safeMode = ec.safeMode 2657 dmls, err = s.genAndFilterInsertDMLs(ec.tctx, param, exprFilter) 2658 if err != nil { 2659 return nil, terror.Annotatef(err, "gen insert sqls failed, sourceTable: %v, targetTable: %v", sourceTable, targetTable) 2660 } 2661 s.metricsProxies.BinlogEventCost.WithLabelValues(metrics.BinlogEventCostStageGenWriteRows, s.cfg.Name, s.cfg.WorkerName, s.cfg.SourceID).Observe(time.Since(ec.startTime).Seconds()) 2662 2663 case replication.UPDATE_ROWS_EVENTv0, replication.UPDATE_ROWS_EVENTv1, replication.UPDATE_ROWS_EVENTv2: 2664 oldExprFilter, newExprFilter, err2 := s.exprFilterGroup.GetUpdateExprs(sourceTable, tableInfo) 2665 if err2 != nil { 2666 return nil, err2 2667 } 2668 2669 param.safeMode = ec.safeMode 2670 dmls, err = s.genAndFilterUpdateDMLs(ec.tctx, param, oldExprFilter, newExprFilter) 2671 if err != nil { 2672 return nil, 
terror.Annotatef(err, "gen update sqls failed, sourceTable: %v, targetTable: %v", sourceTable, targetTable) 2673 } 2674 s.metricsProxies.BinlogEventCost.WithLabelValues(metrics.BinlogEventCostStageGenUpdateRows, s.cfg.Name, s.cfg.WorkerName, s.cfg.SourceID).Observe(time.Since(ec.startTime).Seconds()) 2675 2676 case replication.DELETE_ROWS_EVENTv0, replication.DELETE_ROWS_EVENTv1, replication.DELETE_ROWS_EVENTv2: 2677 exprFilter, err2 := s.exprFilterGroup.GetDeleteExprs(sourceTable, tableInfo) 2678 if err2 != nil { 2679 return nil, err2 2680 } 2681 2682 dmls, err = s.genAndFilterDeleteDMLs(ec.tctx, param, exprFilter) 2683 if err != nil { 2684 return nil, terror.Annotatef(err, "gen delete sqls failed, sourceTable: %v, targetTable: %v", sourceTable, targetTable) 2685 } 2686 s.metricsProxies.BinlogEventCost.WithLabelValues(metrics.BinlogEventCostStageGenDeleteRows, s.cfg.Name, s.cfg.WorkerName, s.cfg.SourceID).Observe(time.Since(ec.startTime).Seconds()) 2687 2688 default: 2689 ec.tctx.L().Debug("ignoring unrecognized event", zap.String("event", "row"), zap.Stringer("type", ec.header.EventType)) 2690 return nil, nil 2691 } 2692 2693 startTime := time.Now() 2694 2695 metricTp := "" 2696 for i := range dmls { 2697 metricTp = dmlMetric[dmls[i].Type()] 2698 job := newDMLJob(dmls[i], &ec) 2699 added2Queue, err2 := s.handleJobFunc(job) 2700 if err2 != nil || !added2Queue { 2701 return nil, err2 2702 } 2703 } 2704 s.metricsProxies.DispatchBinlogDurationHistogram.WithLabelValues(metricTp, s.cfg.Name, s.cfg.SourceID).Observe(time.Since(startTime).Seconds()) 2705 2706 if len(sourceTable.Schema) != 0 && !s.cfg.EnableGTID { 2707 // when in position-based replication, now events before table checkpoint is sent to queue. But in GTID-based 2708 // replication events may share one GTID, so event whose end position is equal to table checkpoint may not be 2709 // sent to queue. We may need event index in GTID to resolve it. 2710 // when in gtid-based replication, we can't save gtid to table point because this transaction may not 2711 // have been fully replicated to downstream. 
So we don't save table point here but record the affected tables and returns 2712 s.saveTablePoint(sourceTable, ec.endLocation) 2713 } 2714 2715 failpoint.Inject("flushFirstJob", func() { 2716 if waitJobsDoneForTest { 2717 s.tctx.L().Info("trigger flushFirstJob") 2718 waitJobsDoneForTest = false 2719 2720 err2 := s.flushJobs() 2721 if err2 != nil { 2722 s.tctx.L().DPanic("flush checkpoint failed", zap.Error(err2)) 2723 } 2724 failpoint.Return(nil, nil) 2725 } 2726 }) 2727 2728 return sourceTable, s.flushIfOutdated() 2729 } 2730 2731 type queryEventContext struct { 2732 *eventContext 2733 2734 p *parser.Parser // used parser 2735 ddlSchema string // used schema 2736 originSQL string // before split 2737 // split multi-schema change DDL into multiple one schema change DDL due to TiDB's limitation 2738 splitDDLs []string // after split before online ddl 2739 appliedDDLs []string // after onlineDDL apply if onlineDDL != nil 2740 needHandleDDLs []string // after route 2741 2742 shardingDDLInfo *ddlInfo 2743 trackInfos []*ddlInfo 2744 sourceTbls map[string]map[string]struct{} // db name -> tb name 2745 onlineDDLTable *filter.Table 2746 eventStatusVars []byte // binlog StatusVars 2747 timestamp uint32 2748 timezone string 2749 } 2750 2751 func (qec *queryEventContext) String() string { 2752 startLocation := qec.startLocation.String() 2753 currentLocation := qec.endLocation.String() 2754 lastLocation := qec.lastLocation.String() 2755 var needHandleDDLs, shardingReSync string 2756 if qec.needHandleDDLs != nil { 2757 needHandleDDLs = strings.Join(qec.needHandleDDLs, ",") 2758 } 2759 if qec.shardingReSync != nil { 2760 shardingReSync = qec.shardingReSync.String() 2761 } 2762 trackInfos := make([]string, 0, len(qec.trackInfos)) 2763 for _, trackInfo := range qec.trackInfos { 2764 trackInfos = append(trackInfos, trackInfo.String()) 2765 } 2766 return fmt.Sprintf("{schema: %s, originSQL: %s, startLocation: %s, endLocation: %s, lastLocation: %s, re-sync: %s, needHandleDDLs: %s, trackInfos: %s}", 2767 qec.ddlSchema, qec.originSQL, startLocation, currentLocation, lastLocation, shardingReSync, needHandleDDLs, strings.Join(trackInfos, ",")) 2768 } 2769 2770 // generateExtendColumn generate extended columns by extractor. 2771 func generateExtendColumn(data [][]interface{}, r *regexprrouter.RouteTable, table *filter.Table, sourceID string) [][]interface{} { 2772 extendCol, extendVal := r.FetchExtendColumn(table.Schema, table.Name, sourceID) 2773 if len(extendCol) == 0 { 2774 return nil 2775 } 2776 2777 rows := make([][]interface{}, len(data)) 2778 for i := range data { 2779 rows[i] = data[i] 2780 for _, v := range extendVal { 2781 rows[i] = append(rows[i], v) 2782 } 2783 } 2784 return rows 2785 } 2786 2787 // trackDDL tracks ddl in schemaTracker. 2788 func (s *Syncer) trackDDL(usedSchema string, trackInfo *ddlInfo, ec *eventContext) error { 2789 var ( 2790 srcTables = trackInfo.sourceTables 2791 targetTables = trackInfo.targetTables 2792 srcTable = srcTables[0] 2793 ) 2794 2795 // Make sure the needed tables are all loaded into the schema tracker. 
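// The flags below are filled per statement type by the switch that follows: whether the source schema, the
// leading tables and the referenced tables must already be present in the tracker, whether the downstream
// index information should be re-tracked, and whether the DDL itself should be replayed on the schema tracker.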
2796 var ( 2797 shouldExecDDLOnSchemaTracker bool 2798 shouldSchemaExist bool 2799 shouldTableExistNum int // tableNames[:shouldTableExistNum] should exist 2800 shouldRefTableExistNum int // tableNames[1:shouldRefTableExistNum] should exist, since the first one is the "caller table" 2801 shouldReTrackDownstreamIndex bool // re-track the downstream index info 2802 ) 2803 2804 switch node := trackInfo.stmtCache.(type) { 2805 case *ast.CreateDatabaseStmt: 2806 shouldExecDDLOnSchemaTracker = true 2807 case *ast.AlterDatabaseStmt: 2808 shouldExecDDLOnSchemaTracker = true 2809 shouldSchemaExist = true 2810 case *ast.DropDatabaseStmt: 2811 shouldExecDDLOnSchemaTracker = true 2812 shouldReTrackDownstreamIndex = true 2813 if s.cfg.ShardMode == "" { 2814 if err := s.checkpoint.DeleteSchemaPoint(ec.tctx, srcTable.Schema); err != nil { 2815 return err 2816 } 2817 } 2818 case *ast.RecoverTableStmt: 2819 shouldExecDDLOnSchemaTracker = true 2820 shouldSchemaExist = true 2821 case *ast.CreateTableStmt, *ast.CreateViewStmt: 2822 shouldExecDDLOnSchemaTracker = true 2823 shouldSchemaExist = true 2824 // for CREATE TABLE LIKE/AS, the reference tables should exist 2825 shouldRefTableExistNum = len(srcTables) 2826 case *ast.DropTableStmt: 2827 shouldExecDDLOnSchemaTracker = true 2828 shouldReTrackDownstreamIndex = true 2829 if err := s.checkpoint.DeleteTablePoint(ec.tctx, srcTable); err != nil { 2830 return err 2831 } 2832 case *ast.RenameTableStmt, *ast.CreateIndexStmt, *ast.DropIndexStmt, *ast.RepairTableStmt: 2833 shouldExecDDLOnSchemaTracker = true 2834 shouldSchemaExist = true 2835 shouldTableExistNum = 1 2836 shouldReTrackDownstreamIndex = true 2837 case *ast.AlterTableStmt: 2838 shouldSchemaExist = true 2839 shouldReTrackDownstreamIndex = true 2840 // for a DDL that adds an FK, since TiDB doesn't fully support it yet, we simply ignore execution of this DDL.
2841 switch { 2842 case len(node.Specs) == 1 && node.Specs[0].Constraint != nil && node.Specs[0].Constraint.Tp == ast.ConstraintForeignKey: 2843 shouldTableExistNum = 1 2844 shouldExecDDLOnSchemaTracker = false 2845 case node.Specs[0].Tp == ast.AlterTableRenameTable: 2846 shouldTableExistNum = 1 2847 shouldExecDDLOnSchemaTracker = true 2848 default: 2849 shouldTableExistNum = len(srcTables) 2850 shouldExecDDLOnSchemaTracker = true 2851 } 2852 case *ast.LockTablesStmt, *ast.UnlockTablesStmt, *ast.CleanupTableLockStmt, *ast.TruncateTableStmt: 2853 break 2854 default: 2855 ec.tctx.L().DPanic("unhandled DDL type cannot be tracked", zap.Stringer("type", reflect.TypeOf(trackInfo.stmtCache))) 2856 } 2857 2858 if shouldReTrackDownstreamIndex { 2859 s.schemaTracker.RemoveDownstreamSchema(s.tctx, targetTables) 2860 } 2861 2862 if shouldSchemaExist { 2863 if err := s.schemaTracker.CreateSchemaIfNotExists(srcTable.Schema); err != nil { 2864 return terror.ErrSchemaTrackerCannotCreateSchema.Delegate(err, srcTable.Schema) 2865 } 2866 } 2867 for i := 0; i < shouldTableExistNum; i++ { 2868 if _, err := s.getTableInfo(ec.tctx, srcTables[i], targetTables[i]); err != nil { 2869 return err 2870 } 2871 } 2872 // skip getTable before in above loop 2873 start := 1 2874 if shouldTableExistNum > start { 2875 start = shouldTableExistNum 2876 } 2877 for i := start; i < shouldRefTableExistNum; i++ { 2878 if err := s.schemaTracker.CreateSchemaIfNotExists(srcTables[i].Schema); err != nil { 2879 return terror.ErrSchemaTrackerCannotCreateSchema.Delegate(err, srcTables[i].Schema) 2880 } 2881 if _, err := s.getTableInfo(ec.tctx, srcTables[i], targetTables[i]); err != nil { 2882 return err 2883 } 2884 } 2885 2886 if shouldExecDDLOnSchemaTracker { 2887 if err := s.schemaTracker.Exec(ec.tctx.Ctx, usedSchema, trackInfo.originStmt); err != nil { 2888 if ignoreTrackerDDLError(err) { 2889 ec.tctx.L().Warn("will ignore a DDL error when tracking", 2890 zap.String("schema", usedSchema), 2891 zap.String("statement", trackInfo.originDDL), 2892 log.WrapStringerField("location", ec.endLocation), 2893 log.ShortError(err)) 2894 return nil 2895 } 2896 ec.tctx.L().Error("cannot track DDL", 2897 zap.String("schema", usedSchema), 2898 zap.String("statement", trackInfo.originDDL), 2899 log.WrapStringerField("location", ec.endLocation), 2900 log.ShortError(err)) 2901 return terror.ErrSchemaTrackerCannotExecDDL.Delegate(err, trackInfo.originDDL) 2902 } 2903 s.exprFilterGroup.ResetExprs(srcTable) 2904 } 2905 2906 return nil 2907 } 2908 2909 func (s *Syncer) trackOriginDDL(ev *replication.QueryEvent, ec eventContext) (map[string]map[string]struct{}, error) { 2910 originSQL := strings.TrimSpace(string(ev.Query)) 2911 if originSQL == "BEGIN" || originSQL == "COMMIT" || originSQL == "" || utils.IsBuildInSkipDDL(originSQL) { 2912 return nil, nil 2913 } 2914 var err error 2915 qec := &queryEventContext{ 2916 eventContext: &ec, 2917 ddlSchema: string(ev.Schema), 2918 originSQL: utils.TrimCtrlChars(originSQL), 2919 splitDDLs: make([]string, 0), 2920 appliedDDLs: make([]string, 0), 2921 sourceTbls: make(map[string]map[string]struct{}), 2922 eventStatusVars: ev.StatusVars, 2923 } 2924 qec.p, err = event.GetParserForStatusVars(ev.StatusVars) 2925 if err != nil { 2926 s.tctx.L().Warn("found error when get sql_mode from binlog status_vars", zap.Error(err)) 2927 } 2928 stmt, err := parseOneStmt(qec) 2929 if err != nil { 2930 // originSQL can't be parsed => can't be tracked by schema tracker 2931 // we can use operate-schema to set a compatible schema after this 
2932 return nil, err 2933 } 2934 2935 if _, ok := stmt.(ast.DDLNode); !ok { 2936 return nil, nil 2937 } 2938 2939 // TiDB can't handle multi schema change DDL, so we split it here. 2940 qec.splitDDLs, err = parserpkg.SplitDDL(stmt, qec.ddlSchema) 2941 if err != nil { 2942 return nil, err 2943 } 2944 2945 for _, sql := range qec.splitDDLs { 2946 sqls, err := s.ddlWorker.processOneDDL(qec, sql) 2947 if err != nil { 2948 s.tctx.L().Warn("processOneDDL failed", zap.Error(err)) 2949 qec.appliedDDLs = append(qec.appliedDDLs, sql) 2950 } else { 2951 qec.appliedDDLs = append(qec.appliedDDLs, sqls...) 2952 } 2953 } 2954 2955 affectedTbls := make(map[string]map[string]struct{}) 2956 for _, sql := range qec.appliedDDLs { 2957 ddlInfo, err := s.ddlWorker.genDDLInfo(qec, sql) 2958 if err != nil { 2959 return nil, err 2960 } 2961 sourceTable := ddlInfo.sourceTables[0] 2962 switch ddlInfo.stmtCache.(type) { 2963 case *ast.DropDatabaseStmt: 2964 delete(affectedTbls, sourceTable.Schema) 2965 case *ast.DropTableStmt: 2966 if affectedTable, ok := affectedTbls[sourceTable.Schema]; ok { 2967 delete(affectedTable, sourceTable.Name) 2968 } 2969 default: 2970 if _, ok := affectedTbls[sourceTable.Schema]; !ok { 2971 affectedTbls[sourceTable.Schema] = make(map[string]struct{}) 2972 } 2973 affectedTbls[sourceTable.Schema][sourceTable.Name] = struct{}{} 2974 } 2975 err = s.trackDDL(qec.ddlSchema, ddlInfo, qec.eventContext) 2976 if err != nil { 2977 return nil, err 2978 } 2979 } 2980 2981 return affectedTbls, nil 2982 } 2983 2984 func (s *Syncer) genRouter() error { 2985 s.tableRouter, _ = regexprrouter.NewRegExprRouter(s.cfg.CaseSensitive, []*router.TableRule{}) 2986 for _, rule := range s.cfg.RouteRules { 2987 err := s.tableRouter.AddRule(rule) 2988 if err != nil { 2989 return terror.ErrSyncerUnitGenTableRouter.Delegate(err) 2990 } 2991 } 2992 return nil 2993 } 2994 2995 func (s *Syncer) loadTableStructureFromDump(ctx context.Context) error { 2996 logger := s.tctx.L() 2997 // TODO: delete this check after we support parallel reading the files to improve load speed 2998 if !storage.IsLocalDiskPath(s.cfg.LoaderConfig.Dir) { 2999 logger.Warn("skip load table structure from dump files for non-local-dir loader because it may be slow", zap.String("loaderDir", s.cfg.LoaderConfig.Dir)) 3000 return nil 3001 } 3002 files, err := storage.CollectDirFiles(ctx, s.cfg.LoaderConfig.Dir, nil) 3003 if err != nil { 3004 logger.Warn("fail to get dump files", zap.Error(err)) 3005 return err 3006 } 3007 var dbs, tables []string 3008 var tableFiles [][2]string // [db, filename] 3009 for f := range files { 3010 // TODO: handle db/table name escaped bu dumpling. 
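// Dump files are matched by dumpling's naming convention (typically `<db>-schema-create.sql` for database
// schemas and `<db>.<table>-schema.sql` for table schemas); tables configured with extend columns are not
// loaded from the dump here.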
3011 if db, ok := utils.GetDBFromDumpFilename(f); ok { 3012 dbs = append(dbs, db) 3013 continue 3014 } 3015 if db, table, ok := utils.GetTableFromDumpFilename(f); ok { 3016 cols, _ := s.tableRouter.FetchExtendColumn(db, table, s.cfg.SourceID) 3017 if len(cols) > 0 { 3018 continue 3019 } 3020 tables = append(tables, dbutil.TableName(db, table)) 3021 tableFiles = append(tableFiles, [2]string{db, f}) 3022 continue 3023 } 3024 } 3025 logger.Info("fetch table structure from dump files", 3026 zap.Strings("database", dbs), 3027 zap.Any("tables", tables)) 3028 for _, db := range dbs { 3029 if err = s.schemaTracker.CreateSchemaIfNotExists(db); err != nil { 3030 return err 3031 } 3032 } 3033 3034 var firstErr error 3035 setFirstErr := func(err error) { 3036 if firstErr == nil { 3037 firstErr = err 3038 } 3039 } 3040 p, err := conn.GetParserFromSQLModeStr(s.cfg.LoaderConfig.SQLMode) 3041 if err != nil { 3042 logger.Error("failed to create parser from SQL Mode, will skip loadTableStructureFromDump", 3043 zap.String("SQLMode", s.cfg.LoaderConfig.SQLMode), 3044 zap.Error(err)) 3045 return err 3046 } 3047 3048 for _, dbAndFile := range tableFiles { 3049 db, file := dbAndFile[0], dbAndFile[1] 3050 content, err2 := storage.ReadFile(ctx, s.cfg.LoaderConfig.Dir, file, nil) 3051 if err2 != nil { 3052 logger.Warn("fail to read file for creating table in schema tracker", 3053 zap.String("db", db), 3054 zap.String("path", s.cfg.LoaderConfig.Dir), 3055 zap.String("file", file), 3056 zap.Error(err2)) 3057 setFirstErr(err2) 3058 continue 3059 } 3060 stmts := bytes.Split(content, []byte(";\n")) 3061 for _, stmt := range stmts { 3062 stmt = bytes.TrimSpace(stmt) 3063 if len(stmt) == 0 || bytes.HasPrefix(stmt, []byte("/*")) { 3064 continue 3065 } 3066 stmtNode, err := p.ParseOneStmt(string(stmt), "", "") 3067 if err != nil { 3068 logger.Warn("fail to parse statement for creating table in schema tracker", 3069 zap.String("db", db), 3070 zap.String("path", s.cfg.LoaderConfig.Dir), 3071 zap.String("file", file), 3072 zap.ByteString("statement", stmt), 3073 zap.Error(err)) 3074 setFirstErr(err) 3075 continue 3076 } 3077 switch v := stmtNode.(type) { 3078 case *ast.SetStmt: 3079 logger.Warn("ignoring statement", 3080 zap.String("type", fmt.Sprintf("%T", v)), 3081 zap.ByteString("statement", stmt)) 3082 case *ast.CreateTableStmt: 3083 err = s.schemaTracker.Exec(ctx, db, stmtNode) 3084 if err != nil { 3085 logger.Warn("fail to create table for dump files", 3086 zap.Any("path", s.cfg.LoaderConfig.Dir), 3087 zap.Any("file", file), 3088 zap.ByteString("statement", stmt), 3089 zap.Error(err)) 3090 setFirstErr(err) 3091 continue 3092 } 3093 s.saveTablePoint( 3094 &filter.Table{Schema: db, Name: v.Table.Name.O}, 3095 s.getFlushedGlobalPoint(), 3096 ) 3097 default: 3098 err = s.schemaTracker.Exec(ctx, db, stmtNode) 3099 if err != nil { 3100 logger.Warn("fail to create table for dump files", 3101 zap.Any("path", s.cfg.LoaderConfig.Dir), 3102 zap.Any("file", file), 3103 zap.ByteString("statement", stmt), 3104 zap.Error(err)) 3105 setFirstErr(err) 3106 continue 3107 } 3108 } 3109 } 3110 } 3111 return firstErr 3112 } 3113 3114 func (s *Syncer) createDBs(ctx context.Context) error { 3115 var err error 3116 dbCfg := s.cfg.From 3117 dbCfg.RawDBCfg = dbconfig.DefaultRawDBConfig().SetReadTimeout(maxDMLConnectionTimeout) 3118 fromDB, fromConns, err := dbconn.CreateConns(s.tctx, s.cfg, conn.UpstreamDBConfig(&dbCfg), 1, s.cfg.DumpIOTotalBytes, s.cfg.DumpUUID) 3119 if err != nil { 3120 return err 3121 } 3122 s.fromDB = 
&dbconn.UpStreamConn{BaseDB: fromDB}
3123 	s.fromConn = fromConns[0]
3124 	baseConn, err := s.fromDB.BaseDB.GetBaseConn(ctx)
3125 	if err != nil {
3126 		return err
3127 	}
3128 	lcFlavor, err := conn.FetchLowerCaseTableNamesSetting(ctx, baseConn)
3129 	if err != nil {
3130 		return err
3131 	}
3132 	s.SourceTableNamesFlavor = lcFlavor
3133 
3134 	hasSQLMode := false
3135 	// get sql_mode from upstream db
3136 	if s.cfg.To.Session == nil {
3137 		s.cfg.To.Session = make(map[string]string)
3138 	} else {
3139 		for k := range s.cfg.To.Session {
3140 			if strings.ToLower(k) == "sql_mode" {
3141 				hasSQLMode = true
3142 				break
3143 			}
3144 		}
3145 	}
3146 	if !hasSQLMode {
3147 		sqlMode, err2 := conn.GetGlobalVariable(tcontext.NewContext(ctx, log.L()), s.fromDB.BaseDB, "sql_mode")
3148 		if err2 != nil {
3149 			s.tctx.L().Warn("cannot get sql_mode from upstream database, the sql_mode will be assigned \"IGNORE_SPACE, NO_AUTO_VALUE_ON_ZERO, ALLOW_INVALID_DATES\"", log.ShortError(err2))
3150 		}
3151 		sqlModes, err3 := conn.AdjustSQLModeCompatible(sqlMode)
3152 		if err3 != nil {
3153 			s.tctx.L().Warn("cannot adjust sql_mode to be compatible, the sql_mode will stay the same", log.ShortError(err3))
3154 		}
3155 		s.cfg.To.Session["sql_mode"] = sqlModes
3156 	}
3157 
3158 	dbCfg = s.cfg.To
3159 	dbCfg.RawDBCfg = dbconfig.DefaultRawDBConfig().
3160 		SetReadTimeout(maxDMLConnectionTimeout).
3161 		SetMaxIdleConns(s.cfg.WorkerCount)
3162 
3163 	s.toDB, s.toDBConns, err = dbconn.CreateConns(s.tctx, s.cfg, conn.DownstreamDBConfig(&dbCfg), s.cfg.WorkerCount, s.cfg.IOTotalBytes, s.cfg.UUID)
3164 	if err != nil {
3165 		dbconn.CloseUpstreamConn(s.tctx, s.fromDB) // release resources acquired earlier before returning with an error
3166 		return err
3167 	}
3168 	// baseConn for ddl
3169 	dbCfg = s.cfg.To
3170 	dbCfg.RawDBCfg = dbconfig.DefaultRawDBConfig().SetReadTimeout(maxDDLConnectionTimeout)
3171 
3172 	var ddlDBConns []*dbconn.DBConn
3173 	s.ddlDB, ddlDBConns, err = dbconn.CreateConns(s.tctx, s.cfg, conn.DownstreamDBConfig(&dbCfg), 2, s.cfg.IOTotalBytes, s.cfg.UUID)
3174 	if err != nil {
3175 		dbconn.CloseUpstreamConn(s.tctx, s.fromDB)
3176 		dbconn.CloseBaseDB(s.tctx, s.toDB)
3177 		return err
3178 	}
3179 	s.ddlDBConn = ddlDBConns[0]
3180 	s.downstreamTrackConn = ddlDBConns[1]
3181 	printServerVersion(s.tctx, s.fromDB.BaseDB, "upstream")
3182 	printServerVersion(s.tctx, s.toDB, "downstream")
3183 
3184 	return nil
3185 }
3186 
3187 // closeDBs closes all opened DBs, the rollback for createConns.
3188 func (s *Syncer) closeDBs() {
3189 	dbconn.CloseUpstreamConn(s.tctx, s.fromDB)
3190 	dbconn.CloseBaseDB(s.tctx, s.toDB)
3191 	dbconn.CloseBaseDB(s.tctx, s.ddlDB)
3192 }
3193 
3194 // recordSkipSQLsLocation records the position of skipped ddl/dml sqls.
3195 // It leaves newJob's sql argument empty to distinguish normal sqls from skipped ones.
3196 func (s *Syncer) recordSkipSQLsLocation(ec *eventContext) error {
3197 	job := newSkipJob(ec)
3198 	_, err := s.handleJobFunc(job)
3199 	return err
3200 }
3201 
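// Editor's note: an illustrative sketch (not part of the original source) of the cleanup
// pattern used by createDBs above: each resource opened in an earlier step is explicitly
// released before an error from a later step is returned, so a failed call leaves nothing
// open. With hypothetical openA/openB helpers the shape is:
//
//	func openBoth() (a, b io.Closer, err error) {
//		if a, err = openA(); err != nil {
//			return nil, nil, err
//		}
//		if b, err = openB(); err != nil {
//			a.Close() // release the earlier resource before surfacing the error
//			return nil, nil, err
//		}
//		return a, b, nil
//	}
//
// createDBs follows the same shape: CloseUpstreamConn/CloseBaseDB are called for the
// connections already created whenever a later CreateConns call fails, and closeDBs is
// the full rollback once everything has been created.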
3202 // flushJobs adds a flush job and waits for all jobs to finish.
3203 // NOTE: currently, the flush job is always a sync operation.
3204 func (s *Syncer) flushJobs() error {
3205 	flushJobSeq := s.getFlushSeq()
3206 	s.tctx.L().Info("flush all jobs", zap.Stringer("global checkpoint", s.checkpoint), zap.Int64("flush job seq", flushJobSeq))
3207 	job := newFlushJob(s.cfg.WorkerCount, flushJobSeq)
3208 	_, err := s.handleJobFunc(job)
3209 	return err
3210 }
3211 
3212 func (s *Syncer) route(table *filter.Table) *filter.Table {
3213 	return route(s.tableRouter, table)
3214 }
3215 
3216 func route(tableRouter *regexprrouter.RouteTable, table *filter.Table) *filter.Table {
3217 	if table.Schema == "" {
3218 		return table
3219 	}
3220 	// nolint:errcheck
3221 	targetSchema, targetTable, _ := tableRouter.Route(table.Schema, table.Name)
3222 	if targetSchema == "" {
3223 		return table
3224 	}
3225 	if targetTable == "" {
3226 		targetTable = table.Name
3227 	}
3228 
3229 	return &filter.Table{Schema: targetSchema, Name: targetTable}
3230 }
3231 
3232 func (s *Syncer) IsRunning() bool {
3233 	return s.running.Load()
3234 }
3235 
3236 func (s *Syncer) isClosed() bool {
3237 	return s.closed.Load()
3238 }
3239 
3240 // Close closes syncer.
3241 func (s *Syncer) Close() {
3242 	s.Lock()
3243 	defer s.Unlock()
3244 
3245 	if s.isClosed() {
3246 		return
3247 	}
3248 	s.stopSync()
3249 	s.closeDBs()
3250 	s.checkpoint.Close()
3251 	s.schemaTracker.Close()
3252 	if s.sgk != nil {
3253 		s.sgk.Close()
3254 	}
3255 	s.closeOnlineDDL()
3256 	// when closing syncer by `stop-task`, remove active relay log from hub
3257 	s.removeActiveRelayLog()
3258 	s.metricsProxies.RemoveLabelValuesWithTaskInMetrics(s.cfg.Name)
3259 
3260 	s.runWg.Wait()
3261 	s.closed.Store(true)
3262 }
3263 
3264 // Kill kills the syncer without a graceful shutdown.
3265 func (s *Syncer) Kill() {
3266 	s.tctx.L().Warn("kill syncer without graceful")
3267 	s.runCancel()
3268 	s.syncCancel()
3269 	s.Close()
3270 }
3271 
3272 // stopSync stops the stream and rolls back the checkpoint. Now it's used by Close() and Pause().
3273 func (s *Syncer) stopSync() {
3274 	// before re-writing the workflow for s.syncer, simply close it here;
3275 	// when resuming, re-create s.syncer
3276 
3277 	if s.streamerController != nil {
3278 		s.streamerController.Close()
3279 	}
3280 
3281 	// try to roll back checkpoints; if they are already flushed, this has no effect. This should be called before closing schemaTracker.
3282 	prePos := s.checkpoint.GlobalPoint()
3283 	s.checkpoint.Rollback()
3284 	currPos := s.checkpoint.GlobalPoint()
3285 	if binlog.CompareLocation(prePos, currPos, s.cfg.EnableGTID) != 0 {
3286 		s.tctx.L().Warn("rollback global checkpoint", zap.Stringer("previous position", prePos), zap.Stringer("current position", currPos))
3287 	}
3288 }
3289 
3290 func (s *Syncer) closeOnlineDDL() {
3291 	if s.onlineDDL != nil {
3292 		s.onlineDDL.Close()
3293 		s.onlineDDL = nil
3294 	}
3295 }
3296 
3297 // Pause implements Unit.Pause.
3298 func (s *Syncer) Pause() {
3299 	if s.isClosed() {
3300 		s.tctx.L().Warn("try to pause, but already closed")
3301 		return
3302 	}
3303 	s.stopSync()
3304 	s.schemaTracker.Close()
3305 }
3306 
3307 // Resume resumes the paused process.
3308 func (s *Syncer) Resume(ctx context.Context, pr chan pb.ProcessResult) {
3309 	var err error
3310 	defer func() {
3311 		if err != nil {
3312 			processError := unit.NewProcessError(err)
3313 			s.handleExitErrMetric(processError)
3314 			pr <- pb.ProcessResult{
3315 				IsCanceled: false,
3316 				Errors: []*pb.ProcessError{
3317 					processError,
3318 				},
3319 			}
3320 		}
3321 	}()
3322 	if s.isClosed() {
3323 		s.tctx.L().Warn("try to resume, but already closed")
3324 		return
3325 	}
3326 
3327 	// continue the processing
3328 	s.reset()
3329 	// reset database conns
3330 	err = s.resetDBs(s.tctx.WithContext(ctx))
3331 	if err != nil {
3332 		return
3333 	}
3334 
3335 	s.Process(ctx, pr)
3336 }
3337 
3338 // CheckCanUpdateCfg checks whether the task config can be updated.
3339 // 1. the task must not be in a pessimistic ddl state.
3340 // 2. only the block-allow-list, route/filter rules and syncerConfig can be updated at this moment.
3341 // 3. some config fields from sourceCfg can also be updated, see more in func `copyConfigFromSource`.
3342 func (s *Syncer) CheckCanUpdateCfg(newCfg *config.SubTaskConfig) error {
3343 	s.RLock()
3344 	defer s.RUnlock()
3345 	// can't update when in sharding merge
3346 	if s.cfg.ShardMode == config.ShardPessimistic {
3347 		_, tables := s.sgk.UnresolvedTables()
3348 		if len(tables) > 0 {
3349 			return terror.ErrSyncerUnitUpdateConfigInSharding.Generate(tables)
3350 		}
3351 	}
3352 
3353 	oldCfg, err := s.cfg.Clone()
3354 	if err != nil {
3355 		return err
3356 	}
3357 	oldCfg.BAList = newCfg.BAList
3358 	oldCfg.BWList = newCfg.BWList
3359 	oldCfg.RouteRules = newCfg.RouteRules
3360 	oldCfg.FilterRules = newCfg.FilterRules
3361 	oldCfg.SyncerConfig = newCfg.SyncerConfig
3362 	oldCfg.To.Session = newCfg.To.Session // session is adjusted in `createDBs`
3363 
3364 	// support fields that changed in func `copyConfigFromSource`
3365 	oldCfg.From = newCfg.From
3366 	oldCfg.Flavor = newCfg.Flavor
3367 	oldCfg.ServerID = newCfg.ServerID
3368 	oldCfg.RelayDir = newCfg.RelayDir
3369 	oldCfg.UseRelay = newCfg.UseRelay
3370 	oldCfg.EnableGTID = newCfg.EnableGTID
3371 	oldCfg.CaseSensitive = newCfg.CaseSensitive
3372 
3373 	if oldCfg.String() != newCfg.String() {
3374 		s.tctx.L().Warn("can not update cfg", zap.Stringer("old cfg", oldCfg), zap.Stringer("new cfg", newCfg))
3375 		return terror.ErrWorkerUpdateSubTaskConfig.Generatef("can't update subtask config for syncer because new config contains some fields that should not be changed, task: %s", s.cfg.Name)
3376 	}
3377 	return nil
3378 }
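// Editor's note: an illustrative sketch (not part of the original source) of the
// "clone, overwrite the mutable fields, then compare" technique CheckCanUpdateCfg uses
// above, shown with a hypothetical toy config type:
//
//	type toyCfg struct {
//		Mutable   string // allowed to differ between old and new
//		Immutable string // must stay the same
//	}
//
//	func canUpdate(oldCfg, newCfg toyCfg) bool {
//		merged := oldCfg                // start from a copy of the old config
//		merged.Mutable = newCfg.Mutable // overwrite every field that is allowed to change
//		// any remaining difference means an immutable field was modified
//		return fmt.Sprintf("%+v", merged) == fmt.Sprintf("%+v", newCfg)
//	}
//
// CheckCanUpdateCfg does the same with SubTaskConfig.Clone() and String(): only the fields
// copied onto oldCfg may legitimately differ; any other difference rejects the update.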
3379 
3380 // Update implements Unit.Update.
3381 // Now it only supports updating the config for routes, filters, column-mappings and block-allow-list.
3382 // No config diff is implemented yet, so it simply re-initializes using the new config.
3383 func (s *Syncer) Update(ctx context.Context, cfg *config.SubTaskConfig) error {
3384 	s.Lock()
3385 	defer s.Unlock()
3386 	if s.cfg.ShardMode == config.ShardPessimistic {
3387 		_, tables := s.sgk.UnresolvedTables()
3388 		if len(tables) > 0 {
3389 			return terror.ErrSyncerUnitUpdateConfigInSharding.Generate(tables)
3390 		}
3391 	}
3392 
3393 	var (
3394 		err             error
3395 		oldBaList       *filter.Filter
3396 		oldTableRouter  *regexprrouter.RouteTable
3397 		oldBinlogFilter *bf.BinlogEvent
3398 	)
3399 
3400 	defer func() {
3401 		if err == nil {
3402 			return
3403 		}
3404 		if oldBaList != nil {
3405 			s.baList = oldBaList
3406 		}
3407 		if oldTableRouter != nil {
3408 			s.tableRouter = oldTableRouter
3409 		}
3410 		if oldBinlogFilter != nil {
3411 			s.binlogFilter = oldBinlogFilter
3412 		}
3413 	}()
3414 
3415 	// update block-allow-list
3416 	oldBaList = s.baList
3417 	s.baList, err = filter.New(cfg.CaseSensitive, cfg.BAList)
3418 	if err != nil {
3419 		return terror.ErrSyncerUnitGenBAList.Delegate(err)
3420 	}
3421 
3422 	// update route
3423 	oldTableRouter = s.tableRouter
3424 	s.tableRouter, err = regexprrouter.NewRegExprRouter(cfg.CaseSensitive, cfg.RouteRules)
3425 	if err != nil {
3426 		return terror.ErrSyncerUnitGenTableRouter.Delegate(err)
3427 	}
3428 
3429 	// update binlog filter
3430 	oldBinlogFilter = s.binlogFilter
3431 	s.binlogFilter, err = bf.NewBinlogEvent(cfg.CaseSensitive, cfg.FilterRules)
3432 	if err != nil {
3433 		return terror.ErrSyncerUnitGenBinlogEventFilter.Delegate(err)
3434 	}
3435 
3436 	switch s.cfg.ShardMode {
3437 	case config.ShardPessimistic:
3438 		// re-init sharding group
3439 		err = s.sgk.Init()
3440 		if err != nil {
3441 			return err
3442 		}
3443 
3444 		err = s.initShardingGroups(context.Background(), false) // FIXME: fix context when re-implementing `Update`
3445 		if err != nil {
3446 			return err
3447 		}
3448 	case config.ShardOptimistic:
3449 		err = s.initOptimisticShardDDL(context.Background()) // FIXME: fix context when re-implementing `Update`
3450 		if err != nil {
3451 			return err
3452 		}
3453 	}
3454 
3455 	// update s.cfg
3456 	s.cfg.BAList = cfg.BAList
3457 	s.cfg.RouteRules = cfg.RouteRules
3458 	s.cfg.FilterRules = cfg.FilterRules
3459 	s.cfg.ColumnMappingRules = cfg.ColumnMappingRules
3460 
3461 	// update timezone
3462 	if s.timezone == nil {
3463 		s.timezone, _, err = str2TimezoneOrFromDB(s.tctx.WithContext(ctx), s.cfg.Timezone, conn.DownstreamDBConfig(&s.cfg.To))
3464 		return err
3465 	}
3466 	// update syncer config
3467 	s.cfg.SyncerConfig = cfg.SyncerConfig
3468 
3469 	// update fields that changed in func `copyConfigFromSource`
3470 	s.cfg.From = cfg.From
3471 	s.cfg.Flavor = cfg.Flavor
3472 	s.cfg.ServerID = cfg.ServerID
3473 	s.cfg.RelayDir = cfg.RelayDir
3474 	s.cfg.UseRelay = cfg.UseRelay
3475 	s.cfg.EnableGTID = cfg.EnableGTID
3476 	s.cfg.CaseSensitive = cfg.CaseSensitive
3477 	return nil
3478 }
3479 
3480 // checkpointID returns the ID used for the checkpoint table.
3481 func (s *Syncer) checkpointID() string {
3482 	if len(s.cfg.SourceID) > 0 {
3483 		return s.cfg.SourceID
3484 	}
3485 	return strconv.FormatUint(uint64(s.cfg.ServerID), 10)
3486 }
3487 
3488 // ShardDDLOperation returns the current pending shard DDL lock operation to handle.
3489 func (s *Syncer) ShardDDLOperation() *pessimism.Operation { 3490 return s.pessimist.PendingOperation() 3491 } 3492 3493 func (s *Syncer) setErrLocation(startLocation, endLocation *binlog.Location, isQueryEventEvent bool) { 3494 s.errLocation.Lock() 3495 defer s.errLocation.Unlock() 3496 3497 s.errLocation.isQueryEvent = isQueryEventEvent 3498 if s.errLocation.startLocation == nil || startLocation == nil { 3499 s.errLocation.startLocation = startLocation 3500 } else if binlog.CompareLocation(*startLocation, *s.errLocation.startLocation, s.cfg.EnableGTID) < 0 { 3501 s.errLocation.startLocation = startLocation 3502 } 3503 3504 if s.errLocation.endLocation == nil || endLocation == nil { 3505 s.errLocation.endLocation = endLocation 3506 } else if binlog.CompareLocation(*endLocation, *s.errLocation.endLocation, s.cfg.EnableGTID) < 0 { 3507 s.errLocation.endLocation = endLocation 3508 } 3509 } 3510 3511 func (s *Syncer) getErrLocation() (*binlog.Location, bool) { 3512 s.errLocation.Lock() 3513 defer s.errLocation.Unlock() 3514 return s.errLocation.startLocation, s.errLocation.isQueryEvent 3515 } 3516 3517 func (s *Syncer) handleEventError(err error, startLocation, endLocation binlog.Location, isQueryEvent bool, originSQL string) error { 3518 if err == nil { 3519 return nil 3520 } 3521 s.setErrLocation(&startLocation, &endLocation, isQueryEvent) 3522 if len(originSQL) > 0 { 3523 return terror.Annotatef(err, "startLocation: [%s], endLocation: [%s], origin SQL: [%s]", startLocation, endLocation, originSQL) 3524 } 3525 return terror.Annotatef(err, "startLocation: [%s], endLocation: [%s]", startLocation, endLocation) 3526 } 3527 3528 func (s *Syncer) adjustGlobalPointGTID(tctx *tcontext.Context) (bool, error) { 3529 location := s.checkpoint.GlobalPoint() 3530 // situations that don't need to adjust 3531 // 1. GTID is not enabled 3532 // 2. location already has GTID position 3533 // 3. location is totally new, has no position info 3534 // 4. 
location is too early thus not a COMMIT location, which happens when it's reset by other logic 3535 if !s.cfg.EnableGTID || !gtid.CheckGTIDSetEmpty(location.GetGTID()) || location.Position.Name == "" || location.Position.Pos == 4 { 3536 return false, nil 3537 } 3538 // set enableGTID to false for new streamerController 3539 streamerController := binlogstream.NewStreamerController( 3540 s.syncCfg, 3541 false, 3542 s.fromDB, 3543 s.cfg.RelayDir, 3544 s.timezone, 3545 s.relay, 3546 s.tctx.L(), 3547 ) 3548 3549 endPos := binlog.RemoveRelaySubDirSuffix(location.Position) 3550 startPos := mysql.Position{ 3551 Name: endPos.Name, 3552 Pos: 0, 3553 } 3554 startLocation := location.Clone() 3555 startLocation.Position = startPos 3556 3557 err := streamerController.Start(tctx, startLocation) 3558 if err != nil { 3559 return false, err 3560 } 3561 defer streamerController.Close() 3562 3563 gs, err := reader.GetGTIDsForPosFromStreamer(tctx.Context(), streamerController.GetStreamer(), endPos) 3564 if err != nil { 3565 s.tctx.L().Warn("fail to get gtids for global location", zap.Stringer("pos", location), zap.Error(err)) 3566 return false, err 3567 } 3568 dbConn, err := s.fromDB.BaseDB.GetBaseConn(tctx.Context()) 3569 if err != nil { 3570 s.tctx.L().Warn("fail to build connection", zap.Stringer("pos", location), zap.Error(err)) 3571 return false, err 3572 } 3573 gs, err = conn.AddGSetWithPurged(tctx.Context(), gs, dbConn) 3574 if err != nil { 3575 s.tctx.L().Warn("fail to merge purged gtidSet", zap.Stringer("pos", location), zap.Error(err)) 3576 return false, err 3577 } 3578 err = location.SetGTID(gs) 3579 if err != nil { 3580 s.tctx.L().Warn("fail to set gtid for global location", zap.Stringer("pos", location), 3581 zap.String("adjusted_gtid", gs.String()), zap.Error(err)) 3582 return false, err 3583 } 3584 s.saveGlobalPoint(location) 3585 // redirect streamer for new gtid set location 3586 err = s.streamerController.ResetReplicationSyncer(tctx, location) 3587 if err != nil { 3588 s.tctx.L().Warn("fail to redirect streamer for global location", zap.Stringer("pos", location), 3589 zap.String("adjusted_gtid", gs.String()), zap.Error(err)) 3590 return false, err 3591 } 3592 return true, nil 3593 } 3594 3595 // delLoadTask is called when finish restoring data, to delete load worker in etcd. 3596 func (s *Syncer) delLoadTask() error { 3597 if s.cli == nil { 3598 return nil 3599 } 3600 _, _, err := ha.DelLoadTask(s.cli, s.cfg.Name, s.cfg.SourceID) 3601 if err != nil { 3602 return err 3603 } 3604 s.tctx.Logger.Info("delete load worker in etcd for all mode", zap.String("task", s.cfg.Name), zap.String("source", s.cfg.SourceID)) 3605 return nil 3606 } 3607 3608 // DM originally cached s.cfg.QueueSize * s.cfg.WorkerCount dml jobs in memory in 2.0.X. 3609 // Now if compact: false, dmlJobCh and dmlWorker will both cached s.cfg.QueueSize * s.cfg.WorkerCount/2 jobs. 3610 // If compact: true, dmlJobCh, compactor buffer, compactor output channel and dmlWorker will all cached s.cfg.QueueSize * s.cfg.WorkerCount/4 jobs. 
3611 func calculateChanSize(queueSize, workerCount int, compact bool) int { 3612 chanSize := queueSize * workerCount / 2 3613 if compact { 3614 chanSize /= 2 3615 } 3616 return chanSize 3617 } 3618 3619 func (s *Syncer) setGlobalPointByTime(tctx *tcontext.Context, timeStr string) error { 3620 t, err := utils.ParseStartTimeInLoc(timeStr, s.upstreamTZ) 3621 if err != nil { 3622 return err 3623 } 3624 3625 var ( 3626 loc *binlog.Location 3627 posTp binlog.PosType 3628 ) 3629 3630 if s.relay != nil { 3631 subDir := s.relay.Status(nil).(*pb.RelayStatus).RelaySubDir 3632 relayDir := path.Join(s.cfg.RelayDir, subDir) 3633 finder := binlog.NewLocalBinlogPosFinder(tctx, s.cfg.EnableGTID, s.cfg.Flavor, relayDir) 3634 loc, posTp, err = finder.FindByTimestamp(t.Unix()) 3635 } else { 3636 finder := binlog.NewRemoteBinlogPosFinder(tctx, s.fromDB.BaseDB, s.syncCfg, s.cfg.EnableGTID) 3637 loc, posTp, err = finder.FindByTimestamp(t.Unix()) 3638 } 3639 if err != nil { 3640 s.tctx.L().Error("fail to find binlog position by timestamp", 3641 zap.Time("time", t), 3642 zap.Error(err)) 3643 return err 3644 } 3645 3646 switch posTp { 3647 case binlog.InRangeBinlogPos: 3648 s.tctx.L().Info("find binlog position by timestamp", 3649 zap.String("time", timeStr), 3650 zap.Stringer("pos", loc)) 3651 case binlog.BelowLowerBoundBinlogPos: 3652 s.tctx.L().Warn("fail to find binlog location by timestamp because the timestamp is too early, will use the earliest binlog location", 3653 zap.String("time", timeStr), 3654 zap.Any("location", loc)) 3655 case binlog.AboveUpperBoundBinlogPos: 3656 return terror.ErrConfigStartTimeTooLate.Generate(timeStr) 3657 } 3658 3659 err = s.checkpoint.DeleteAllTablePoint(tctx) 3660 if err != nil { 3661 return err 3662 } 3663 s.checkpoint.SaveGlobalPointForcibly(*loc) 3664 s.tctx.L().Info("Will replicate from the specified time, the location recorded in checkpoint and config file will be ignored", 3665 zap.String("time", timeStr), 3666 zap.Any("locationOfTheTime", loc)) 3667 return nil 3668 } 3669 3670 func (s *Syncer) getFlushedGlobalPoint() binlog.Location { 3671 return s.checkpoint.FlushedGlobalPoint() 3672 } 3673 3674 func (s *Syncer) getInitExecutedLoc() binlog.Location { 3675 s.RLock() 3676 defer s.RUnlock() 3677 return s.initExecutedLoc.Clone() 3678 } 3679 3680 func (s *Syncer) initInitExecutedLoc() { 3681 s.Lock() 3682 defer s.Unlock() 3683 if s.initExecutedLoc == nil { 3684 p := s.checkpoint.GlobalPoint() 3685 s.initExecutedLoc = &p 3686 } 3687 } 3688 3689 func (s *Syncer) getTrackedTableInfo(table *filter.Table) (*model.TableInfo, error) { 3690 return s.schemaTracker.GetTableInfo(table) 3691 } 3692 3693 func (s *Syncer) getDownStreamTableInfo(tctx *tcontext.Context, tableID string, originTI *model.TableInfo) (*schema.DownstreamTableInfo, error) { 3694 return s.schemaTracker.GetDownStreamTableInfo(tctx, tableID, originTI) 3695 } 3696 3697 func (s *Syncer) getTableInfoFromCheckpoint(table *filter.Table) *model.TableInfo { 3698 return s.checkpoint.GetTableInfo(table.Schema, table.Name) 3699 }
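// Editor's note: an illustrative worked example (not part of the original source) of
// calculateChanSize above, using hypothetical settings rather than claiming DM's defaults:
//
//	calculateChanSize(1024, 16, false) // 1024*16/2 = 8192 jobs buffered in dmlJobCh
//	calculateChanSize(1024, 16, true)  // 8192/2    = 4096 jobs, because the compactor's
//	                                   // buffer and output channel hold their own share
//
// This matches the doc comment on calculateChanSize: the overall in-memory budget of roughly
// queueSize*workerCount jobs is split across the stages of the DML pipeline.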