github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/dm/syncer/syncer.go (about) 1 // Copyright 2019 PingCAP, Inc. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // See the License for the specific language governing permissions and 12 // limitations under the License. 13 14 package syncer 15 16 import ( 17 "bytes" 18 "context" 19 "fmt" 20 "math" 21 "os" 22 "path" 23 "reflect" 24 "strconv" 25 "strings" 26 "sync" 27 "time" 28 29 "github.com/go-mysql-org/go-mysql/mysql" 30 "github.com/go-mysql-org/go-mysql/replication" 31 mysql2 "github.com/go-sql-driver/mysql" 32 "github.com/pingcap/errors" 33 "github.com/pingcap/failpoint" 34 tidbddl "github.com/pingcap/tidb/pkg/ddl" 35 "github.com/pingcap/tidb/pkg/parser" 36 "github.com/pingcap/tidb/pkg/parser/ast" 37 "github.com/pingcap/tidb/pkg/parser/model" 38 "github.com/pingcap/tidb/pkg/sessionctx" 39 "github.com/pingcap/tidb/pkg/util/dbutil" 40 "github.com/pingcap/tidb/pkg/util/filter" 41 regexprrouter "github.com/pingcap/tidb/pkg/util/regexpr-router" 42 router "github.com/pingcap/tidb/pkg/util/table-router" 43 "github.com/pingcap/tiflow/dm/config" 44 "github.com/pingcap/tiflow/dm/config/dbconfig" 45 "github.com/pingcap/tiflow/dm/pb" 46 "github.com/pingcap/tiflow/dm/pkg/binlog" 47 "github.com/pingcap/tiflow/dm/pkg/binlog/event" 48 "github.com/pingcap/tiflow/dm/pkg/binlog/reader" 49 "github.com/pingcap/tiflow/dm/pkg/conn" 50 tcontext "github.com/pingcap/tiflow/dm/pkg/context" 51 fr "github.com/pingcap/tiflow/dm/pkg/func-rollback" 52 "github.com/pingcap/tiflow/dm/pkg/gtid" 53 "github.com/pingcap/tiflow/dm/pkg/ha" 54 "github.com/pingcap/tiflow/dm/pkg/log" 55 parserpkg "github.com/pingcap/tiflow/dm/pkg/parser" 56 "github.com/pingcap/tiflow/dm/pkg/schema" 57 "github.com/pingcap/tiflow/dm/pkg/shardddl/optimism" 58 "github.com/pingcap/tiflow/dm/pkg/shardddl/pessimism" 59 "github.com/pingcap/tiflow/dm/pkg/storage" 60 "github.com/pingcap/tiflow/dm/pkg/streamer" 61 "github.com/pingcap/tiflow/dm/pkg/terror" 62 "github.com/pingcap/tiflow/dm/pkg/utils" 63 "github.com/pingcap/tiflow/dm/relay" 64 "github.com/pingcap/tiflow/dm/syncer/binlogstream" 65 "github.com/pingcap/tiflow/dm/syncer/dbconn" 66 "github.com/pingcap/tiflow/dm/syncer/metrics" 67 onlineddl "github.com/pingcap/tiflow/dm/syncer/online-ddl-tools" 68 sm "github.com/pingcap/tiflow/dm/syncer/safe-mode" 69 "github.com/pingcap/tiflow/dm/syncer/shardddl" 70 "github.com/pingcap/tiflow/dm/unit" 71 bf "github.com/pingcap/tiflow/pkg/binlog-filter" 72 "github.com/pingcap/tiflow/pkg/errorutil" 73 "github.com/pingcap/tiflow/pkg/sqlmodel" 74 clientv3 "go.etcd.io/etcd/client/v3" 75 "go.uber.org/atomic" 76 "go.uber.org/zap" 77 ) 78 79 var ( 80 waitTime = 10 * time.Millisecond 81 82 // MaxDDLConnectionTimeoutMinute also used by SubTask.ExecuteDDL. 
83 MaxDDLConnectionTimeoutMinute = 5 84 85 maxDMLConnectionTimeout = "5m" 86 maxDDLConnectionTimeout = fmt.Sprintf("%dm", MaxDDLConnectionTimeoutMinute) 87 88 maxDMLConnectionDuration, _ = time.ParseDuration(maxDMLConnectionTimeout) 89 90 defaultMaxPauseOrStopWaitTime = 10 * time.Second 91 92 adminQueueName = "admin queue" 93 defaultBucketCount = 8 94 ) 95 96 const ( 97 skipJobIdx = iota 98 ddlJobIdx 99 workerJobTSArrayInitSize // size = skip + ddl 100 ) 101 102 // waitXIDStatus represents the status for waiting XID event when pause/stop task. 103 type waitXIDStatus int64 104 105 const ( 106 noWait waitXIDStatus = iota 107 waiting 108 waitComplete 109 ) 110 111 // Syncer can sync your MySQL data to another MySQL database. 112 type Syncer struct { 113 sync.RWMutex 114 115 tctx *tcontext.Context // this ctx only used for logger. 116 117 // this ctx derives from a background ctx and was initialized in s.Run, it is used for some background tasks in s.Run 118 // when this ctx cancelled, syncer will shutdown all background running jobs (except the syncDML and syncDDL) and not wait transaction end. 119 runCtx *tcontext.Context 120 runCancel context.CancelFunc 121 // this ctx only used for syncDML and syncDDL and only cancelled when ungraceful stop. 122 syncCtx *tcontext.Context 123 syncCancel context.CancelFunc 124 // control all goroutines that started in S.Run 125 runWg sync.WaitGroup 126 127 cfg *config.SubTaskConfig 128 syncCfg replication.BinlogSyncerConfig 129 cliArgs *config.TaskCliArgs 130 metricsProxies *metrics.Proxies 131 132 sgk *ShardingGroupKeeper // keeper to keep all sharding (sub) group in this syncer 133 osgk *OptShardingGroupKeeper // optimistic ddl's keeper to keep all sharding (sub) group in this syncer 134 135 pessimist *shardddl.Pessimist // shard DDL pessimist 136 optimist *shardddl.Optimist // shard DDL optimist 137 cli *clientv3.Client 138 139 binlogType binlogstream.BinlogType 140 streamerController *binlogstream.StreamerController 141 142 jobWg sync.WaitGroup // counts ddl/flush/asyncFlush job in-flight in s.dmlJobCh and s.ddlJobCh 143 144 schemaTracker *schema.Tracker 145 146 fromDB *dbconn.UpStreamConn 147 fromConn *dbconn.DBConn 148 149 toDB *conn.BaseDB 150 toDBConns []*dbconn.DBConn 151 ddlDB *conn.BaseDB 152 ddlDBConn *dbconn.DBConn 153 downstreamTrackConn *dbconn.DBConn 154 155 dmlJobCh chan *job 156 ddlJobCh chan *job 157 jobsClosed atomic.Bool 158 jobsChanLock sync.Mutex 159 waitXIDJob atomic.Int64 160 isTransactionEnd bool 161 waitTransactionLock sync.Mutex 162 163 tableRouter *regexprrouter.RouteTable 164 binlogFilter *bf.BinlogEvent 165 baList *filter.Filter 166 exprFilterGroup *ExprFilterGroup 167 sessCtx sessionctx.Context 168 169 running atomic.Bool 170 closed atomic.Bool 171 172 start atomic.Time 173 lastTime atomic.Time 174 175 // safeMode is used to track if we need to generate dml with safe-mode 176 // For each binlog event, we will set the current value into eventContext because 177 // the status of this track may change over time. 
178 safeMode *sm.SafeMode 179 180 upstreamTZ *time.Location 181 timezone *time.Location 182 timezoneLastTime string 183 upstreamTZStr string 184 185 binlogSizeCount atomic.Int64 186 lastBinlogSizeCount atomic.Int64 187 188 lastCount atomic.Int64 189 count atomic.Int64 190 totalRps atomic.Int64 191 rps atomic.Int64 192 193 filteredInsert atomic.Int64 194 filteredUpdate atomic.Int64 195 filteredDelete atomic.Int64 196 197 checkpoint CheckPoint 198 checkpointFlushWorker *checkpointFlushWorker 199 onlineDDL onlineddl.OnlinePlugin 200 201 // record process error rather than log.Fatal 202 runFatalChan chan *pb.ProcessError 203 // record whether error occurred when execute SQLs 204 execError atomic.Error 205 206 readerHub *streamer.ReaderHub 207 recordedActiveRelayLog bool 208 209 currentLocationMu struct { 210 sync.RWMutex 211 currentLocation binlog.Location // use to calc remain binlog size 212 } 213 214 errLocation struct { 215 sync.RWMutex 216 startLocation *binlog.Location 217 endLocation *binlog.Location 218 isQueryEvent bool 219 } 220 221 cutOverLocation atomic.Pointer[binlog.Location] 222 223 handleJobFunc func(*job) (bool, error) 224 flushSeq int64 225 226 // `lower_case_table_names` setting of upstream db 227 SourceTableNamesFlavor conn.LowerCaseTableNamesFlavor 228 229 // time difference between upstream and DM nodes: time of DM - time of upstream. 230 // we use this to calculate replication lag more accurately when clock is not synced 231 // on either upstream or DM nodes. 232 tsOffset atomic.Int64 233 // this field measures the time difference in seconds between current time of DM and 234 // the minimal timestamp of currently processing binlog events. 235 // this lag will consider time difference between upstream and DM nodes 236 secondsBehindMaster atomic.Int64 237 // stores the last job TS(binlog event timestamp) of each worker, 238 // if there's no active job, the corresponding worker's TS is reset to 0. 239 // since DML worker runs jobs in batch, the TS is the TS of the first job in the batch. 240 // We account for skipped events too, if the distance between up and downstream is long, 241 // and there's no interested binlog event in between, we can have a decreasing lag. 242 // - 0 is for skip jobs 243 // - 1 is ddl worker 244 // - 2+ is for DML worker job idx=(queue id + 2) 245 workerJobTSArray []*atomic.Int64 246 lastCheckpointFlushedTime time.Time 247 248 firstMeetBinlogTS *int64 249 exitSafeModeTS *int64 // TS(in binlog header) need to exit safe mode. 250 251 // initial executed binlog location, set once for each instance of syncer. 252 initExecutedLoc *binlog.Location 253 254 relay relay.Process 255 charsetAndDefaultCollation map[string]string 256 idAndCollationMap map[int]string 257 258 ddlWorker *DDLWorker 259 } 260 261 // NewSyncer creates a new Syncer. 262 func NewSyncer(cfg *config.SubTaskConfig, etcdClient *clientv3.Client, relay relay.Process) *Syncer { 263 logFields := []zap.Field{ 264 zap.String("task", cfg.Name), 265 zap.String("unit", "binlog replication"), 266 } 267 var logger log.Logger 268 if cfg.FrameworkLogger != nil { 269 logger = log.Logger{Logger: cfg.FrameworkLogger.With(logFields...)} 270 } else { 271 logger = log.With(logFields...) 
272 } 273 274 syncer := &Syncer{ 275 pessimist: shardddl.NewPessimist(&logger, etcdClient, cfg.Name, cfg.SourceID), 276 optimist: shardddl.NewOptimist(&logger, etcdClient, cfg.Name, cfg.SourceID), 277 } 278 syncer.cfg = cfg 279 syncer.tctx = tcontext.Background().WithLogger(logger) 280 syncer.jobsClosed.Store(true) // not open yet 281 syncer.waitXIDJob.Store(int64(noWait)) 282 syncer.isTransactionEnd = true 283 syncer.closed.Store(false) 284 syncer.lastBinlogSizeCount.Store(0) 285 syncer.binlogSizeCount.Store(0) 286 syncer.lastCount.Store(0) 287 syncer.count.Store(0) 288 syncer.handleJobFunc = syncer.handleJob 289 syncer.cli = etcdClient 290 291 syncer.checkpoint = NewRemoteCheckPoint(syncer.tctx, cfg, syncer.metricsProxies, syncer.checkpointID()) 292 293 syncer.binlogType = binlogstream.RelayToBinlogType(relay) 294 syncer.readerHub = streamer.GetReaderHub() 295 296 if cfg.ShardMode == config.ShardPessimistic { 297 // only need to sync DDL in sharding mode 298 syncer.sgk = NewShardingGroupKeeper(syncer.tctx, cfg, syncer.metricsProxies) 299 } else if cfg.ShardMode == config.ShardOptimistic { 300 syncer.osgk = NewOptShardingGroupKeeper(syncer.tctx, cfg) 301 } 302 syncer.recordedActiveRelayLog = false 303 syncer.workerJobTSArray = make([]*atomic.Int64, cfg.WorkerCount+workerJobTSArrayInitSize) 304 for i := range syncer.workerJobTSArray { 305 syncer.workerJobTSArray[i] = atomic.NewInt64(0) 306 } 307 syncer.lastCheckpointFlushedTime = time.Time{} 308 syncer.relay = relay 309 syncer.safeMode = sm.NewSafeMode() 310 311 return syncer 312 } 313 314 func (s *Syncer) refreshCliArgs() { 315 if s.cli == nil { 316 // for dummy syncer in ut 317 return 318 } 319 cliArgs, err := ha.GetTaskCliArgs(s.cli, s.cfg.Name, s.cfg.SourceID) 320 if err != nil { 321 s.tctx.L().Error("failed to get task cli args", zap.Error(err)) 322 } 323 s.Lock() 324 s.cliArgs = cliArgs 325 s.Unlock() 326 } 327 328 func (s *Syncer) newJobChans() { 329 chanSize := calculateChanSize(s.cfg.QueueSize, s.cfg.WorkerCount, s.cfg.Compact) 330 s.dmlJobCh = make(chan *job, chanSize) 331 s.ddlJobCh = make(chan *job, s.cfg.QueueSize) 332 s.jobsClosed.Store(false) 333 } 334 335 func (s *Syncer) closeJobChans() { 336 s.jobsChanLock.Lock() 337 defer s.jobsChanLock.Unlock() 338 if s.jobsClosed.Load() { 339 return 340 } 341 close(s.dmlJobCh) 342 close(s.ddlJobCh) 343 s.jobsClosed.Store(true) 344 } 345 346 // Type implements Unit.Type. 347 func (s *Syncer) Type() pb.UnitType { 348 return pb.UnitType_Sync 349 } 350 351 // Init initializes syncer for a sync task, but not start Process. 352 // if fail, it should not call s.Close. 353 // some check may move to checker later. 
354 func (s *Syncer) Init(ctx context.Context) (err error) { 355 failpoint.Inject("IOTotalBytes", func(val failpoint.Value) { 356 c := atomic.NewUint64(0) 357 s.cfg.UUID = val.(string) 358 s.cfg.IOTotalBytes = c 359 360 go func() { 361 for { 362 s.tctx.L().Debug("IOTotalBytes", zap.Uint64("IOTotalBytes", s.cfg.IOTotalBytes.Load())) 363 time.Sleep(10 * time.Second) 364 } 365 }() 366 }) 367 rollbackHolder := fr.NewRollbackHolder("syncer") 368 defer func() { 369 if err != nil { 370 rollbackHolder.RollbackReverseOrder() 371 } 372 }() 373 374 tctx := s.tctx.WithContext(ctx) 375 s.upstreamTZ, s.upstreamTZStr, err = str2TimezoneOrFromDB(tctx, "", conn.UpstreamDBConfig(&s.cfg.From)) 376 if err != nil { 377 return 378 } 379 s.timezone, _, err = str2TimezoneOrFromDB(tctx, s.cfg.Timezone, conn.DownstreamDBConfig(&s.cfg.To)) 380 if err != nil { 381 return 382 } 383 384 s.baList, err = filter.New(s.cfg.CaseSensitive, s.cfg.BAList) 385 if err != nil { 386 return terror.ErrSyncerUnitGenBAList.Delegate(err) 387 } 388 389 s.syncCfg, err = subtaskCfg2BinlogSyncerCfg(s.cfg, s.timezone, s.baList) 390 if err != nil { 391 return err 392 } 393 394 err = s.createDBs(ctx) 395 if err != nil { 396 return err 397 } 398 rollbackHolder.Add(fr.FuncRollback{Name: "close-DBs", Fn: s.closeDBs}) 399 400 if s.cfg.CollationCompatible == config.StrictCollationCompatible { 401 s.charsetAndDefaultCollation, s.idAndCollationMap, err = dbconn.GetCharsetAndCollationInfo(tctx, s.fromConn) 402 if err != nil { 403 return err 404 } 405 } 406 407 s.streamerController = binlogstream.NewStreamerController( 408 s.syncCfg, 409 s.cfg.EnableGTID, 410 s.fromDB, 411 s.cfg.RelayDir, 412 s.timezone, 413 s.relay, 414 s.tctx.L(), 415 ) 416 417 s.binlogFilter, err = bf.NewBinlogEvent(s.cfg.CaseSensitive, s.cfg.FilterRules) 418 if err != nil { 419 return terror.ErrSyncerUnitGenBinlogEventFilter.Delegate(err) 420 } 421 422 vars := map[string]string{ 423 "time_zone": s.timezone.String(), 424 } 425 s.sessCtx = utils.NewSessionCtx(vars) 426 s.exprFilterGroup = NewExprFilterGroup(s.tctx, s.sessCtx, s.cfg.ExprFilter) 427 // create an empty Tracker and will be initialized in `Run` 428 s.schemaTracker = schema.NewTracker() 429 430 if s.cfg.OnlineDDL { 431 s.onlineDDL, err = onlineddl.NewRealOnlinePlugin(tctx, s.cfg, s.metricsProxies) 432 if err != nil { 433 return err 434 } 435 rollbackHolder.Add(fr.FuncRollback{Name: "close-onlineDDL", Fn: s.closeOnlineDDL}) 436 } 437 438 err = s.genRouter() 439 if err != nil { 440 return err 441 } 442 443 var schemaMap map[string]string 444 var tableMap map[string]map[string]string 445 if s.SourceTableNamesFlavor == conn.LCTableNamesSensitive { 446 // TODO: we should avoid call this function multi times 447 allTables, err1 := conn.FetchAllDoTables(ctx, s.fromDB.BaseDB, s.baList) 448 if err1 != nil { 449 return err1 450 } 451 schemaMap, tableMap = buildLowerCaseTableNamesMap(s.tctx.L(), allTables) 452 } 453 454 switch s.cfg.ShardMode { 455 case config.ShardPessimistic: 456 err = s.sgk.Init() 457 if err != nil { 458 return err 459 } 460 err = s.initShardingGroups(ctx, true) 461 if err != nil { 462 return err 463 } 464 rollbackHolder.Add(fr.FuncRollback{Name: "close-sharding-group-keeper", Fn: s.sgk.Close}) 465 case config.ShardOptimistic: 466 if err = s.initOptimisticShardDDL(ctx); err != nil { 467 return err 468 } 469 } 470 471 err = s.checkpoint.Init(tctx) 472 if err != nil { 473 return err 474 } 475 476 rollbackHolder.Add(fr.FuncRollback{Name: "close-checkpoint", Fn: s.checkpoint.Close}) 477 478 err = 
s.checkpoint.Load(tctx) 479 if err != nil { 480 return err 481 } 482 if s.SourceTableNamesFlavor == conn.LCTableNamesSensitive { 483 if err = s.checkpoint.CheckAndUpdate(ctx, schemaMap, tableMap); err != nil { 484 return err 485 } 486 487 if s.onlineDDL != nil { 488 if err = s.onlineDDL.CheckAndUpdate(s.tctx, schemaMap, tableMap); err != nil { 489 return err 490 } 491 } 492 } 493 494 // when Init syncer, set active relay log info 495 if s.cfg.Meta == nil || s.cfg.Meta.BinLogName != binlog.FakeBinlogName { 496 err = s.setInitActiveRelayLog(ctx) 497 if err != nil { 498 return err 499 } 500 rollbackHolder.Add(fr.FuncRollback{Name: "remove-active-realylog", Fn: s.removeActiveRelayLog}) 501 } 502 s.reset() 503 504 metricProxies := metrics.DefaultMetricsProxies 505 if s.cfg.MetricsFactory != nil { 506 metricProxies = &metrics.Proxies{} 507 metricProxies.Init(s.cfg.MetricsFactory) 508 } 509 s.metricsProxies = metricProxies.CacheForOneTask(s.cfg.Name, s.cfg.WorkerName, s.cfg.SourceID) 510 511 s.ddlWorker = NewDDLWorker(&s.tctx.Logger, s) 512 return nil 513 } 514 515 // buildLowerCaseTableNamesMap build a lower case schema map and lower case table map for all tables 516 // Input: map of schema --> list of tables 517 // Output: schema names map: lower_case_schema_name --> schema_name 518 // 519 // tables names map: lower_case_schema_name --> lower_case_table_name --> table_name 520 // 521 // Note: the result will skip the schemas and tables that their lower_case_name are the same. 522 func buildLowerCaseTableNamesMap(logger log.Logger, tables map[string][]string) (map[string]string, map[string]map[string]string) { 523 schemaMap := make(map[string]string) 524 tablesMap := make(map[string]map[string]string) 525 lowerCaseSchemaSet := make(map[string]string) 526 for schema, tableNames := range tables { 527 lcSchema := strings.ToLower(schema) 528 // track if there are multiple schema names with the same lower case name. 529 // just skip this kind of schemas. 530 if rawSchema, ok := lowerCaseSchemaSet[lcSchema]; ok { 531 delete(schemaMap, lcSchema) 532 delete(tablesMap, lcSchema) 533 logger.Warn("skip check schema with same lower case value", 534 zap.Strings("schemas", []string{schema, rawSchema})) 535 continue 536 } 537 lowerCaseSchemaSet[lcSchema] = schema 538 539 if lcSchema != schema { 540 schemaMap[lcSchema] = schema 541 } 542 tblsMap := make(map[string]string) 543 lowerCaseTableSet := make(map[string]string) 544 for _, tb := range tableNames { 545 lcTbl := strings.ToLower(tb) 546 if rawTbl, ok := lowerCaseTableSet[lcTbl]; ok { 547 delete(tblsMap, lcTbl) 548 logger.Warn("skip check tables with same lower case value", zap.String("schema", schema), 549 zap.Strings("table", []string{tb, rawTbl})) 550 continue 551 } 552 if lcTbl != tb { 553 tblsMap[lcTbl] = tb 554 } 555 } 556 if len(tblsMap) > 0 { 557 tablesMap[lcSchema] = tblsMap 558 } 559 } 560 return schemaMap, tablesMap 561 } 562 563 // initShardingGroups initializes sharding groups according to source MySQL, filter rules and router rules 564 // NOTE: now we don't support modify router rules after task has started. 
565 func (s *Syncer) initShardingGroups(ctx context.Context, needCheck bool) error { 566 // fetch tables from source and filter them 567 sourceTables, err := s.fromDB.FetchAllDoTables(ctx, s.baList) 568 if err != nil { 569 return err 570 } 571 572 // convert according to router rules 573 // target-ID -> source-IDs 574 mapper := make(map[string][]string, len(sourceTables)) 575 for schema, tables := range sourceTables { 576 for _, table := range tables { 577 sourceTable := &filter.Table{Schema: schema, Name: table} 578 targetTable := s.route(sourceTable) 579 targetID := utils.GenTableID(targetTable) 580 sourceID := utils.GenTableID(sourceTable) 581 _, ok := mapper[targetID] 582 if !ok { 583 mapper[targetID] = make([]string, 0, len(tables)) 584 } 585 mapper[targetID] = append(mapper[targetID], sourceID) 586 } 587 } 588 589 loadMeta, err2 := s.sgk.LoadShardMeta(s.cfg.Flavor, s.cfg.EnableGTID) 590 if err2 != nil { 591 return err2 592 } 593 if needCheck && s.SourceTableNamesFlavor == conn.LCTableNamesSensitive { 594 // try fix persistent data before init 595 schemaMap, tableMap := buildLowerCaseTableNamesMap(s.tctx.L(), sourceTables) 596 if err2 = s.sgk.CheckAndFix(loadMeta, schemaMap, tableMap); err2 != nil { 597 return err2 598 } 599 } 600 601 // add sharding group 602 for targetID, sourceIDs := range mapper { 603 targetTable := utils.UnpackTableID(targetID) 604 _, _, _, _, err := s.sgk.AddGroup(targetTable, sourceIDs, loadMeta[targetID], false) 605 if err != nil { 606 return err 607 } 608 } 609 610 shardGroup := s.sgk.Groups() 611 s.tctx.L().Debug("initial sharding groups", zap.Int("shard group length", len(shardGroup)), zap.Reflect("shard group", shardGroup)) 612 613 return nil 614 } 615 616 // IsFreshTask implements Unit.IsFreshTask. 617 func (s *Syncer) IsFreshTask(ctx context.Context) (bool, error) { 618 globalPoint := s.checkpoint.GlobalPoint() 619 tablePoint := s.checkpoint.TablePoint() 620 // doesn't have neither GTID nor binlog pos 621 return binlog.IsFreshPosition(globalPoint, s.cfg.Flavor, s.cfg.EnableGTID) && len(tablePoint) == 0, nil 622 } 623 624 func (s *Syncer) reset() { 625 if s.streamerController != nil { 626 s.streamerController.Close() 627 } 628 s.secondsBehindMaster.Store(0) 629 for _, jobTS := range s.workerJobTSArray { 630 jobTS.Store(0) 631 } 632 // create new job chans 633 s.newJobChans() 634 s.checkpoint.DiscardPendingSnapshots() 635 s.checkpointFlushWorker = &checkpointFlushWorker{ 636 input: make(chan *checkpointFlushTask, 16), 637 cp: s.checkpoint, 638 execError: &s.execError, 639 afterFlushFn: s.afterFlushCheckpoint, 640 updateJobMetricsFn: s.updateJobMetrics, 641 } 642 643 s.execError.Store(nil) 644 s.setErrLocation(nil, nil, false) 645 s.waitXIDJob.Store(int64(noWait)) 646 s.isTransactionEnd = true 647 s.flushSeq = 0 648 s.firstMeetBinlogTS = nil 649 s.exitSafeModeTS = nil 650 switch s.cfg.ShardMode { 651 case config.ShardPessimistic: 652 // every time start to re-sync from resume, we reset status to make it like a fresh syncing 653 s.sgk.ResetGroups() 654 s.pessimist.Reset() 655 case config.ShardOptimistic: 656 s.osgk.Reset() 657 s.optimist.Reset() 658 } 659 } 660 661 func (s *Syncer) resetDBs(tctx *tcontext.Context) error { 662 var err error 663 664 for i := 0; i < len(s.toDBConns); i++ { 665 err = s.toDBConns[i].ResetConn(tctx) 666 if err != nil { 667 return terror.WithScope(err, terror.ScopeDownstream) 668 } 669 } 670 671 if s.onlineDDL != nil { 672 err = s.onlineDDL.ResetConn(tctx) 673 if err != nil { 674 return terror.WithScope(err, terror.ScopeDownstream) 
675 } 676 } 677 678 if s.sgk != nil { 679 err = s.sgk.dbConn.ResetConn(tctx) 680 if err != nil { 681 return terror.WithScope(err, terror.ScopeDownstream) 682 } 683 } 684 685 err = s.ddlDBConn.ResetConn(tctx) 686 if err != nil { 687 return terror.WithScope(err, terror.ScopeDownstream) 688 } 689 690 err = s.downstreamTrackConn.ResetConn(tctx) 691 if err != nil { 692 return terror.WithScope(err, terror.ScopeDownstream) 693 } 694 695 err = s.checkpoint.ResetConn(tctx) 696 if err != nil { 697 return terror.WithScope(err, terror.ScopeDownstream) 698 } 699 700 return nil 701 } 702 703 func (s *Syncer) handleExitErrMetric(err *pb.ProcessError) { 704 if unit.IsResumableError(err) { 705 s.metricsProxies.Metrics.ExitWithResumableErrorCounter.Inc() 706 } else { 707 s.metricsProxies.Metrics.ExitWithNonResumableErrorCounter.Inc() 708 } 709 } 710 711 // Process implements the dm.Unit interface. 712 func (s *Syncer) Process(ctx context.Context, pr chan pb.ProcessResult) { 713 s.metricsProxies.Metrics.ExitWithResumableErrorCounter.Add(0) 714 s.metricsProxies.Metrics.ExitWithNonResumableErrorCounter.Add(0) 715 716 newCtx, cancel := context.WithCancel(ctx) 717 defer cancel() 718 719 // create new done chan 720 // use lock of Syncer to avoid Close while Process 721 s.Lock() 722 if s.isClosed() { 723 s.Unlock() 724 return 725 } 726 s.Unlock() 727 728 runFatalChan := make(chan *pb.ProcessError, s.cfg.WorkerCount+1) 729 s.runFatalChan = runFatalChan 730 var ( 731 errs = make([]*pb.ProcessError, 0, 2) 732 errsMu sync.Mutex 733 ) 734 735 var wg sync.WaitGroup 736 wg.Add(1) 737 go func() { 738 defer wg.Done() 739 for { 740 err, ok := <-runFatalChan 741 if !ok { 742 return 743 } 744 cancel() // cancel s.Run 745 errsMu.Lock() 746 errs = append(errs, err) 747 errsMu.Unlock() 748 } 749 }() 750 751 wg.Add(1) 752 go func() { 753 defer wg.Done() 754 <-newCtx.Done() // ctx or newCtx 755 }() 756 757 err := s.Run(newCtx) 758 if err != nil { 759 // returned error rather than sent to runFatalChan 760 // cancel goroutines created in s.Run 761 cancel() 762 } 763 close(runFatalChan) // Run returned, all potential fatal sent to s.runFatalChan 764 wg.Wait() // wait for receive all fatal from s.runFatalChan 765 766 if err != nil { 767 if utils.IsContextCanceledError(err) { 768 s.tctx.L().Info("filter out error caused by user cancel", log.ShortError(err)) 769 } else { 770 s.tctx.L().Debug("unit syncer quits with error", zap.Error(err)) 771 errsMu.Lock() 772 errs = append(errs, unit.NewProcessError(err)) 773 errsMu.Unlock() 774 } 775 } 776 777 isCanceled := false 778 select { 779 case <-ctx.Done(): 780 isCanceled = true 781 default: 782 } 783 784 for _, processError := range errs { 785 s.handleExitErrMetric(processError) 786 } 787 pr <- pb.ProcessResult{ 788 IsCanceled: isCanceled, 789 Errors: errs, 790 } 791 } 792 793 // getTableInfo returns a table info for sourceTable, it should not be modified by caller. 
794 func (s *Syncer) getTableInfo(tctx *tcontext.Context, sourceTable, targetTable *filter.Table) (*model.TableInfo, error) { 795 if s.schemaTracker == nil { 796 return nil, terror.ErrSchemaTrackerIsClosed.New("schema tracker not init") 797 } 798 ti, err := s.schemaTracker.GetTableInfo(sourceTable) 799 if err == nil { 800 return ti, nil 801 } 802 if !schema.IsTableNotExists(err) { 803 return nil, terror.ErrSchemaTrackerCannotGetTable.Delegate(err, sourceTable) 804 } 805 806 if err = s.schemaTracker.CreateSchemaIfNotExists(sourceTable.Schema); err != nil { 807 return nil, terror.ErrSchemaTrackerCannotCreateSchema.Delegate(err, sourceTable.Schema) 808 } 809 810 // if the table does not exist (IsTableNotExists(err)), continue to fetch the table from downstream and create it. 811 err = s.trackTableInfoFromDownstream(tctx, sourceTable, targetTable) 812 if err != nil { 813 return nil, err 814 } 815 816 ti, err = s.schemaTracker.GetTableInfo(sourceTable) 817 if err != nil { 818 return nil, terror.ErrSchemaTrackerCannotGetTable.Delegate(err, sourceTable) 819 } 820 return ti, nil 821 } 822 823 // getDBInfoFromDownstream tries to track the db info from the downstream. It will not overwrite existing table. 824 func (s *Syncer) getDBInfoFromDownstream(tctx *tcontext.Context, sourceTable, targetTable *filter.Table) (*model.DBInfo, error) { 825 // TODO: Switch to use the HTTP interface to retrieve the TableInfo directly if HTTP port is available 826 // use parser for downstream. 827 parser2, err := dbconn.GetParserForConn(tctx, s.ddlDBConn) 828 if err != nil { 829 return nil, terror.ErrSchemaTrackerCannotParseDownstreamTable.Delegate(err, targetTable, sourceTable) 830 } 831 832 createSQL, err := dbconn.GetSchemaCreateSQL(tctx, s.ddlDBConn, targetTable.Schema) 833 if err != nil { 834 return nil, terror.ErrSchemaTrackerCannotFetchDownstreamTable.Delegate(err, targetTable, sourceTable) 835 } 836 837 createNode, err := parser2.ParseOneStmt(createSQL, "", "") 838 if err != nil { 839 return nil, terror.ErrSchemaTrackerCannotParseDownstreamTable.Delegate(err, targetTable, sourceTable) 840 } 841 stmt := createNode.(*ast.CreateDatabaseStmt) 842 843 // we only consider explicit charset/collate, if not found, fallback to default charset/collate. 844 charsetOpt := ast.CharsetOpt{} 845 for _, val := range stmt.Options { 846 switch val.Tp { 847 case ast.DatabaseOptionCharset: 848 charsetOpt.Chs = val.Value 849 case ast.DatabaseOptionCollate: 850 charsetOpt.Col = val.Value 851 } 852 } 853 854 chs, coll, err := tidbddl.ResolveCharsetCollation(nil, charsetOpt) 855 if err != nil { 856 return nil, errors.Trace(err) 857 } 858 859 return &model.DBInfo{Name: stmt.Name, Charset: chs, Collate: coll}, nil 860 } 861 862 // trackTableInfoFromDownstream tries to track the table info from the downstream. It will not overwrite existing table. 863 func (s *Syncer) trackTableInfoFromDownstream(tctx *tcontext.Context, sourceTable, targetTable *filter.Table) error { 864 // TODO: Switch to use the HTTP interface to retrieve the TableInfo directly if HTTP port is available 865 // use parser for downstream. 
866 parser2, err := dbconn.GetParserForConn(tctx, s.ddlDBConn) 867 if err != nil { 868 return terror.ErrSchemaTrackerCannotParseDownstreamTable.Delegate(err, targetTable, sourceTable) 869 } 870 871 createSQL, err := dbconn.GetTableCreateSQL(tctx, s.ddlDBConn, targetTable.String()) 872 if err != nil { 873 return terror.ErrSchemaTrackerCannotFetchDownstreamTable.Delegate(err, targetTable, sourceTable) 874 } 875 876 // rename the table back to original. 877 var createNode ast.StmtNode 878 createNode, err = parser2.ParseOneStmt(createSQL, "", "") 879 if err != nil { 880 return terror.ErrSchemaTrackerCannotParseDownstreamTable.Delegate(err, targetTable, sourceTable) 881 } 882 createStmt := createNode.(*ast.CreateTableStmt) 883 createStmt.IfNotExists = true 884 createStmt.Table.Schema = model.NewCIStr(sourceTable.Schema) 885 createStmt.Table.Name = model.NewCIStr(sourceTable.Name) 886 887 // schema tracker sets non-clustered index, so can't handle auto_random. 888 for _, col := range createStmt.Cols { 889 for i, opt := range col.Options { 890 if opt.Tp == ast.ColumnOptionAutoRandom { 891 // col.Options is unordered 892 col.Options[i] = col.Options[len(col.Options)-1] 893 col.Options = col.Options[:len(col.Options)-1] 894 break 895 } 896 } 897 } 898 899 tctx.L().Debug("reverse-synchronized table schema", 900 zap.Stringer("sourceTable", sourceTable), 901 zap.Stringer("targetTable", targetTable), 902 zap.String("sql", createSQL), 903 ) 904 if err = s.schemaTracker.Exec(tctx.Ctx, sourceTable.Schema, createStmt); err != nil { 905 return terror.ErrSchemaTrackerCannotCreateTable.Delegate(err, sourceTable) 906 } 907 908 return nil 909 } 910 911 var dmlMetric = map[sqlmodel.RowChangeType]string{ 912 sqlmodel.RowChangeInsert: "insert", 913 sqlmodel.RowChangeUpdate: "update", 914 sqlmodel.RowChangeDelete: "delete", 915 } 916 917 func (s *Syncer) updateJobMetrics(isFinished bool, queueBucket string, j *job) { 918 tp := j.tp 919 targetTable := j.targetTable 920 count := 1 921 if tp == ddl { 922 count = len(j.ddls) 923 } 924 925 m := s.metricsProxies.AddedJobsTotal 926 if isFinished { 927 s.count.Add(int64(count)) 928 m = s.metricsProxies.FinishedJobsTotal 929 } 930 switch tp { 931 case dml: 932 m.WithLabelValues(dmlMetric[j.dml.Type()], s.cfg.Name, queueBucket, s.cfg.SourceID, s.cfg.WorkerName, targetTable.Schema, targetTable.Name).Add(float64(count)) 933 case ddl, flush, asyncFlush, conflict, compact: 934 m.WithLabelValues(tp.String(), s.cfg.Name, queueBucket, s.cfg.SourceID, s.cfg.WorkerName, targetTable.Schema, targetTable.Name).Add(float64(count)) 935 case skip, xid: 936 // ignore skip/xid jobs 937 default: 938 s.tctx.L().Warn("unknown job operation type", zap.Stringer("type", j.tp)) 939 } 940 } 941 942 func (s *Syncer) calcReplicationLag(headerTS int64) int64 { 943 return time.Now().Unix() - s.tsOffset.Load() - headerTS 944 } 945 946 // updateReplicationJobTS store job TS, it is called after every batch dml job / one skip job / one ddl job is added and committed. 
947 func (s *Syncer) updateReplicationJobTS(job *job, jobIdx int) { 948 // when job is nil mean no job in this bucket, need to reset this bucket job ts to 0 949 if job == nil { 950 s.workerJobTSArray[jobIdx].Store(0) 951 } else { 952 s.workerJobTSArray[jobIdx].Store(int64(job.eventHeader.Timestamp)) 953 } 954 } 955 956 func (s *Syncer) updateReplicationLagMetric() { 957 var lag int64 958 var minTS int64 959 960 for idx := range s.workerJobTSArray { 961 if ts := s.workerJobTSArray[idx].Load(); ts != int64(0) { 962 if minTS == int64(0) || ts < minTS { 963 minTS = ts 964 } 965 } 966 } 967 if minTS != int64(0) { 968 lag = s.calcReplicationLag(minTS) 969 } 970 971 s.metricsProxies.Metrics.ReplicationLagHistogram.Observe(float64(lag)) 972 s.metricsProxies.Metrics.ReplicationLagGauge.Set(float64(lag)) 973 s.secondsBehindMaster.Store(lag) 974 975 failpoint.Inject("ShowLagInLog", func(v failpoint.Value) { 976 minLag := v.(int) 977 if int(lag) >= minLag { 978 s.tctx.L().Info("ShowLagInLog", zap.Int64("lag", lag)) 979 } 980 }) 981 982 // reset skip job TS in case of skip job TS is never updated 983 if minTS == s.workerJobTSArray[skipJobIdx].Load() { 984 s.workerJobTSArray[skipJobIdx].Store(0) 985 } 986 } 987 988 func (s *Syncer) saveTablePoint(table *filter.Table, location binlog.Location) { 989 ti, err := s.schemaTracker.GetTableInfo(table) 990 if err != nil && table.Name != "" { 991 // TODO: if we RENAME tb1 TO tb2, the tracker will remove TableInfo of tb1 but we still save the table 992 // checkpoint for tb1. We can delete the table checkpoint in future. 993 s.tctx.L().Warn("table info missing from schema tracker", 994 zap.Stringer("table", table), 995 zap.Stringer("location", location), 996 zap.Error(err)) 997 } 998 s.checkpoint.SaveTablePoint(table, location, ti) 999 } 1000 1001 // only used in tests. 1002 var ( 1003 lastLocationForTest binlog.Location 1004 lastLocationNumForTest int 1005 waitJobsDoneForTest bool 1006 failExecuteSQLForTest bool 1007 failOnceForTest atomic.Bool 1008 waitBeforeRunExitDurationForTest time.Duration 1009 ) 1010 1011 // TODO: move to syncer/job.go 1012 // addJob adds one job to DML queue or DDL queue according to its type. 1013 // Caller should prepare all needed jobs before calling this function, addJob should not generate any new jobs. 1014 // There should not be a second way to send jobs to DML queue or DDL queue. 1015 func (s *Syncer) addJob(job *job) { 1016 failpoint.Inject("countJobFromOneEvent", func() { 1017 if job.tp == dml { 1018 if job.currentLocation.Position.Compare(lastLocationForTest.Position) == 0 { 1019 lastLocationNumForTest++ 1020 } else { 1021 lastLocationForTest = job.currentLocation 1022 lastLocationNumForTest = 1 1023 } 1024 // trigger a flush after see one job 1025 if lastLocationNumForTest == 1 { 1026 waitJobsDoneForTest = true 1027 s.tctx.L().Info("meet the first job of an event", zap.Any("binlog position", lastLocationForTest)) 1028 } 1029 // mock a execution error after see two jobs. 
1030 if lastLocationNumForTest == 2 { 1031 failExecuteSQLForTest = true 1032 s.tctx.L().Info("meet the second job of an event", zap.Any("binlog position", lastLocationForTest)) 1033 } 1034 } 1035 }) 1036 failpoint.Inject("countJobFromOneGTID", func() { 1037 if job.tp == dml { 1038 if binlog.CompareLocation(job.currentLocation, lastLocationForTest, true) == 0 { 1039 lastLocationNumForTest++ 1040 } else { 1041 lastLocationForTest = job.currentLocation 1042 lastLocationNumForTest = 1 1043 } 1044 // trigger a flush after see one job 1045 if lastLocationNumForTest == 1 { 1046 waitJobsDoneForTest = true 1047 s.tctx.L().Info("meet the first job of a GTID", zap.Any("binlog position", lastLocationForTest)) 1048 } 1049 // mock a execution error after see two jobs. 1050 if lastLocationNumForTest == 2 { 1051 failExecuteSQLForTest = true 1052 s.tctx.L().Info("meet the second job of a GTID", zap.Any("binlog position", lastLocationForTest)) 1053 } 1054 } 1055 }) 1056 1057 // avoid job.type data race with compactor.run() 1058 // simply copy the opType for performance, though copy a new job in compactor is better 1059 tp := job.tp 1060 switch tp { 1061 case flush: 1062 s.jobWg.Add(1) 1063 s.dmlJobCh <- job 1064 case asyncFlush: 1065 s.jobWg.Add(1) 1066 s.dmlJobCh <- job 1067 case ddl: 1068 s.updateJobMetrics(false, adminQueueName, job) 1069 s.jobWg.Add(1) 1070 startTime := time.Now() 1071 s.ddlJobCh <- job 1072 s.metricsProxies.AddJobDurationHistogram.WithLabelValues("ddl", s.cfg.Name, adminQueueName, s.cfg.SourceID).Observe(time.Since(startTime).Seconds()) 1073 case dml: 1074 failpoint.Inject("SkipDML", func(val failpoint.Value) { 1075 // first col should be an int and primary key, every row with pk <= val will be skipped 1076 skippedIDUpperBound := val.(int) 1077 firstColVal, _ := strconv.Atoi(fmt.Sprintf("%v", job.dml.RowValues()[0])) 1078 if firstColVal <= skippedIDUpperBound { 1079 failpoint.Goto("skip_dml") 1080 } 1081 }) 1082 s.dmlJobCh <- job 1083 failpoint.Label("skip_dml") 1084 failpoint.Inject("checkCheckpointInMiddleOfTransaction", func() { 1085 s.tctx.L().Info("receive dml job", zap.Any("dml job", job)) 1086 time.Sleep(500 * time.Millisecond) 1087 }) 1088 case gc: 1089 s.dmlJobCh <- job 1090 default: 1091 s.tctx.L().DPanic("unhandled job type", zap.Stringer("job", job)) 1092 } 1093 } 1094 1095 // flushIfOutdated checks whether syncer should flush now because last flushing is outdated. 1096 func (s *Syncer) flushIfOutdated() error { 1097 if !s.checkpoint.LastFlushOutdated() { 1098 return nil 1099 } 1100 1101 if s.cfg.Experimental.AsyncCheckpointFlush { 1102 jobSeq := s.getFlushSeq() 1103 s.tctx.L().Info("Start to async flush current checkpoint to downstream based on flush interval", zap.Int64("job sequence", jobSeq)) 1104 j := newAsyncFlushJob(s.cfg.WorkerCount, jobSeq) 1105 s.addJob(j) 1106 s.flushCheckPointsAsync(j) 1107 return nil 1108 } 1109 return s.flushJobs() 1110 } 1111 1112 // TODO: move to syncer/job.go 1113 // handleJob will do many actions based on job type. 1114 func (s *Syncer) handleJob(job *job) (added2Queue bool, err error) { 1115 skipCheckFlush := false 1116 defer func() { 1117 if !skipCheckFlush && err == nil { 1118 if cutoverLocation := s.cutOverLocation.Load(); cutoverLocation != nil && binlog.CompareLocation(*cutoverLocation, job.currentLocation, s.cfg.EnableGTID) <= 0 { 1119 err = s.flushJobs() 1120 s.cutOverLocation.Store(nil) 1121 } else { 1122 err = s.flushIfOutdated() 1123 } 1124 } 1125 }() 1126 1127 // 1. 
handle jobs that not needed to be sent to queue 1128 1129 s.waitTransactionLock.Lock() 1130 defer s.waitTransactionLock.Unlock() 1131 1132 failpoint.Inject("checkCheckpointInMiddleOfTransaction", func() { 1133 if waitXIDStatus(s.waitXIDJob.Load()) == waiting { 1134 s.tctx.L().Info("not receive xid job yet", zap.Any("next job", job)) 1135 } 1136 }) 1137 1138 if waitXIDStatus(s.waitXIDJob.Load()) == waitComplete && job.tp != flush { 1139 s.tctx.L().Info("All jobs is completed before syncer close, the coming job will be reject", zap.Any("job", job)) 1140 return 1141 } 1142 1143 switch job.tp { 1144 case xid: 1145 failpoint.Inject("SkipSaveGlobalPoint", func() { 1146 s.tctx.L().Info("skip save global point", zap.String("failpoint", "SkipSaveGlobalPoint")) 1147 panic("SkipSaveGlobalPoint") 1148 }) 1149 s.waitXIDJob.CAS(int64(waiting), int64(waitComplete)) 1150 s.saveGlobalPoint(job.location) 1151 s.isTransactionEnd = true 1152 return 1153 case skip: 1154 if job.eventHeader.EventType == replication.QUERY_EVENT { 1155 // skipped ddl includes: 1156 // - ddls that dm don't handle, such as analyze table(can be parsed), create function(cannot be parsed) 1157 // - ddls related to db/table which is filtered 1158 // for those ddls we record its location, so checkpoint can match master position if skipped ddl 1159 // is the last binlog in source db 1160 s.saveGlobalPoint(job.location) 1161 } 1162 s.updateReplicationJobTS(job, skipJobIdx) 1163 return 1164 } 1165 1166 // 2. send the job to queue 1167 1168 s.addJob(job) 1169 added2Queue = true 1170 1171 // 3. after job is sent to queue 1172 1173 switch job.tp { 1174 case dml: 1175 // DMl events will generate many jobs and only caller knows all jobs has been sent, so many logics are done by 1176 // caller 1177 s.isTransactionEnd = false 1178 skipCheckFlush = true 1179 return 1180 case ddl: 1181 s.jobWg.Wait() 1182 1183 // skip rest logic when downstream error 1184 if s.execError.Load() != nil { 1185 // nolint:nilerr 1186 return 1187 } 1188 s.updateReplicationJobTS(nil, ddlJobIdx) // clear ddl job ts because this ddl is already done. 1189 failpoint.Inject("ExitAfterDDLBeforeFlush", func() { 1190 s.tctx.L().Warn("exit triggered", zap.String("failpoint", "ExitAfterDDLBeforeFlush")) 1191 utils.OsExit(1) 1192 }) 1193 // interrupted after executed DDL and before save checkpoint. 1194 failpoint.Inject("FlushCheckpointStage", func(val failpoint.Value) { 1195 err = handleFlushCheckpointStage(3, val.(int), "before save checkpoint") 1196 if err != nil { 1197 failpoint.Return() 1198 } 1199 }) 1200 // save global checkpoint for DDL 1201 s.saveGlobalPoint(job.location) 1202 for sourceSchema, tbs := range job.sourceTbls { 1203 if len(sourceSchema) == 0 { 1204 continue 1205 } 1206 for _, sourceTable := range tbs { 1207 s.saveTablePoint(sourceTable, job.location) 1208 } 1209 } 1210 // reset sharding group after checkpoint saved 1211 s.resetShardingGroup(job.targetTable) 1212 1213 // interrupted after save checkpoint and before flush checkpoint. 
1214 failpoint.Inject("FlushCheckpointStage", func(val failpoint.Value) { 1215 err = handleFlushCheckpointStage(4, val.(int), "before flush checkpoint") 1216 if err != nil { 1217 failpoint.Return() 1218 } 1219 }) 1220 skipCheckFlush = true 1221 err = s.flushCheckPoints() 1222 return 1223 case flush: 1224 s.jobWg.Wait() 1225 skipCheckFlush = true 1226 err = s.flushCheckPoints() 1227 return 1228 case asyncFlush: 1229 skipCheckFlush = true 1230 } 1231 // nolint:nakedret 1232 return 1233 } 1234 1235 func (s *Syncer) saveGlobalPoint(globalLocation binlog.Location) { 1236 if s.cfg.ShardMode == config.ShardPessimistic { 1237 globalLocation = s.sgk.AdjustGlobalLocation(globalLocation) 1238 } else if s.cfg.ShardMode == config.ShardOptimistic { 1239 globalLocation = s.osgk.AdjustGlobalLocation(globalLocation) 1240 } 1241 s.checkpoint.SaveGlobalPoint(globalLocation) 1242 } 1243 1244 func (s *Syncer) resetShardingGroup(table *filter.Table) { 1245 if s.cfg.ShardMode == config.ShardPessimistic { 1246 // for DDL sharding group, reset group after checkpoint saved 1247 group := s.sgk.Group(table) 1248 if group != nil { 1249 group.Reset() 1250 } 1251 } 1252 } 1253 1254 // flushCheckPoints synchronously flushes previous saved checkpoint in memory to persistent storage, like TiDB 1255 // we flush checkpoints in four cases: 1256 // 1. DDL executed 1257 // 2. pausing / stopping the sync (driven by `s.flushJobs`) 1258 // 3. IsFreshTask return true 1259 // 4. Heartbeat event received 1260 // 1261 // but when error occurred, we can not flush checkpoint, otherwise data may lost 1262 // and except rejecting to flush the checkpoint, we also need to rollback the checkpoint saved before 1263 // this should be handled when `s.Run` returned 1264 // 1265 // we may need to refactor the concurrency model to make the work-flow more clear later. 1266 func (s *Syncer) flushCheckPoints() error { 1267 err := s.execError.Load() 1268 // TODO: for now, if any error occurred (including user canceled), checkpoint won't be updated. But if we have put 1269 // optimistic shard info, DM-master may resolved the optimistic lock and let other worker execute DDL. So after this 1270 // worker resume, it can not execute the DML/DDL in old binlog because of downstream table structure mismatching. 1271 // We should find a way to (compensating) implement a transaction containing interaction with both etcd and SQL. 1272 if err != nil && (terror.ErrDBExecuteFailed.Equal(err) || terror.ErrDBUnExpect.Equal(err)) { 1273 s.tctx.L().Warn("error detected when executing SQL job, skip sync flush checkpoints", 1274 zap.Stringer("checkpoint", s.checkpoint), 1275 zap.Error(err)) 1276 return nil 1277 } 1278 1279 snapshotInfo, exceptTables, shardMetaSQLs, shardMetaArgs := s.createCheckpointSnapshot(true) 1280 1281 if snapshotInfo == nil { 1282 s.tctx.L().Debug("checkpoint has no change, skip sync flush checkpoint") 1283 return nil 1284 } 1285 1286 syncFlushErrCh := make(chan error, 1) 1287 1288 task := &checkpointFlushTask{ 1289 snapshotInfo: snapshotInfo, 1290 exceptTables: exceptTables, 1291 shardMetaSQLs: shardMetaSQLs, 1292 shardMetaArgs: shardMetaArgs, 1293 asyncflushJob: nil, 1294 syncFlushErrCh: syncFlushErrCh, 1295 } 1296 s.checkpointFlushWorker.Add(task) 1297 1298 return <-syncFlushErrCh 1299 } 1300 1301 // flushCheckPointsAsync asynchronous flushes checkpoint. 
1302 func (s *Syncer) flushCheckPointsAsync(asyncFlushJob *job) { 1303 err := s.execError.Load() 1304 // TODO: for now, if any error occurred (including user canceled), checkpoint won't be updated. But if we have put 1305 // optimistic shard info, DM-master may resolved the optimistic lock and let other worker execute DDL. So after this 1306 // worker resume, it can not execute the DML/DDL in old binlog because of downstream table structure mismatching. 1307 // We should find a way to (compensating) implement a transaction containing interaction with both etcd and SQL. 1308 if err != nil && (terror.ErrDBExecuteFailed.Equal(err) || terror.ErrDBUnExpect.Equal(err)) { 1309 s.tctx.L().Warn("error detected when executing SQL job, skip async flush checkpoints", 1310 zap.Stringer("checkpoint", s.checkpoint), 1311 zap.Error(err)) 1312 return 1313 } 1314 1315 snapshotInfo, exceptTables, shardMetaSQLs, shardMetaArgs := s.createCheckpointSnapshot(false) 1316 1317 if snapshotInfo == nil { 1318 s.tctx.L().Debug("checkpoint has no change, skip async flush checkpoint", zap.Int64("job seq", asyncFlushJob.flushSeq)) 1319 return 1320 } 1321 1322 task := &checkpointFlushTask{ 1323 snapshotInfo: snapshotInfo, 1324 exceptTables: exceptTables, 1325 shardMetaSQLs: shardMetaSQLs, 1326 shardMetaArgs: shardMetaArgs, 1327 asyncflushJob: asyncFlushJob, 1328 syncFlushErrCh: nil, 1329 } 1330 s.checkpointFlushWorker.Add(task) 1331 } 1332 1333 func (s *Syncer) createCheckpointSnapshot(isSyncFlush bool) (*SnapshotInfo, []*filter.Table, []string, [][]interface{}) { 1334 snapshotInfo := s.checkpoint.Snapshot(isSyncFlush) 1335 if snapshotInfo == nil { 1336 return nil, nil, nil, nil 1337 } 1338 1339 var ( 1340 exceptTableIDs map[string]bool 1341 exceptTables []*filter.Table 1342 shardMetaSQLs []string 1343 shardMetaArgs [][]interface{} 1344 ) 1345 if s.cfg.ShardMode == config.ShardPessimistic { 1346 // flush all checkpoints except tables which are unresolved for sharding DDL for the pessimistic mode. 1347 // NOTE: for the optimistic mode, because we don't handle conflicts automatically (or no re-direct supported), 1348 // so we can simply flush checkpoint for all tables now, and after re-direct supported this should be updated. 1349 exceptTableIDs, exceptTables = s.sgk.UnresolvedTables() 1350 s.tctx.L().Info("flush checkpoints except for these tables", zap.Reflect("tables", exceptTables)) 1351 1352 shardMetaSQLs, shardMetaArgs = s.sgk.PrepareFlushSQLs(exceptTableIDs) 1353 s.tctx.L().Info("prepare flush sqls", zap.Strings("shard meta sqls", shardMetaSQLs), zap.Reflect("shard meta arguments", shardMetaArgs)) 1354 } 1355 1356 return snapshotInfo, exceptTables, shardMetaSQLs, shardMetaArgs 1357 } 1358 1359 func (s *Syncer) afterFlushCheckpoint(task *checkpointFlushTask) error { 1360 // add a gc job to let causality module gc outdated kvs. 
1361 if task.asyncflushJob != nil { 1362 s.tctx.L().Info("after async flushed checkpoint, gc stale causality keys", zap.Int64("flush job seq", task.asyncflushJob.flushSeq)) 1363 s.addJob(newGCJob(task.asyncflushJob.flushSeq)) 1364 } else { 1365 s.tctx.L().Info("after sync flushed checkpoint, gc all causality keys") 1366 s.addJob(newGCJob(math.MaxInt64)) 1367 } 1368 1369 // update current active relay log after checkpoint flushed 1370 err := s.updateActiveRelayLog(task.snapshotInfo.globalPos.Position) 1371 if err != nil { 1372 return err 1373 } 1374 1375 now := time.Now() 1376 if !s.lastCheckpointFlushedTime.IsZero() { 1377 duration := now.Sub(s.lastCheckpointFlushedTime).Seconds() 1378 s.metricsProxies.Metrics.FlushCheckPointsTimeInterval.Observe(duration) 1379 } 1380 s.lastCheckpointFlushedTime = now 1381 1382 s.logAndClearFilteredStatistics() 1383 1384 if s.cliArgs != nil && s.cliArgs.StartTime != "" && s.cli != nil { 1385 clone := *s.cliArgs 1386 clone.StartTime = "" 1387 err2 := ha.PutTaskCliArgs(s.cli, s.cfg.Name, []string{s.cfg.SourceID}, clone) 1388 if err2 != nil { 1389 s.tctx.L().Error("failed to clean start-time in task cli args", zap.Error(err2)) 1390 } else { 1391 s.Lock() 1392 s.cliArgs.StartTime = "" 1393 s.Unlock() 1394 } 1395 } 1396 return nil 1397 } 1398 1399 func (s *Syncer) logAndClearFilteredStatistics() { 1400 filteredInsert := s.filteredInsert.Swap(0) 1401 filteredUpdate := s.filteredUpdate.Swap(0) 1402 filteredDelete := s.filteredDelete.Swap(0) 1403 if filteredInsert > 0 || filteredUpdate > 0 || filteredDelete > 0 { 1404 s.tctx.L().Info("after last flushing checkpoint, DM has ignored row changes by expression filter", 1405 zap.Int64("number of filtered insert", filteredInsert), 1406 zap.Int64("number of filtered update", filteredUpdate), 1407 zap.Int64("number of filtered delete", filteredDelete)) 1408 } 1409 } 1410 1411 // DDL synced one by one, so we only need to process one DDL at a time. 1412 func (s *Syncer) syncDDL(queueBucket string, db *dbconn.DBConn, ddlJobChan chan *job) { 1413 defer s.runWg.Done() 1414 1415 var err error 1416 for { 1417 ddlJob, ok := <-ddlJobChan 1418 if !ok { 1419 return 1420 } 1421 1422 // add this ddl ts beacause we start to exec this ddl. 
1423 s.updateReplicationJobTS(ddlJob, ddlJobIdx) 1424 1425 failpoint.Inject("BlockDDLJob", func(v failpoint.Value) { 1426 t := v.(int) // sleep time 1427 s.tctx.L().Info("BlockDDLJob", zap.Any("job", ddlJob), zap.Int("sleep time", t)) 1428 time.Sleep(time.Second * time.Duration(t)) 1429 }) 1430 1431 var ( 1432 ignore = false 1433 shardPessimistOp *pessimism.Operation 1434 ) 1435 switch s.cfg.ShardMode { 1436 case config.ShardPessimistic: 1437 shardPessimistOp = s.pessimist.PendingOperation() 1438 if shardPessimistOp != nil && !shardPessimistOp.Exec { 1439 ignore = true 1440 s.tctx.L().Info("ignore shard DDLs in pessimistic shard mode", zap.Strings("ddls", ddlJob.ddls)) 1441 } 1442 case config.ShardOptimistic: 1443 if len(ddlJob.ddls) == 0 { 1444 ignore = true 1445 s.tctx.L().Info("ignore shard DDLs in optimistic mode", zap.Stringer("info", s.optimist.PendingInfo())) 1446 } 1447 } 1448 1449 failpoint.Inject("ExecDDLError", func() { 1450 s.tctx.L().Warn("execute ddl error", zap.Strings("DDL", ddlJob.ddls), zap.String("failpoint", "ExecDDLError")) 1451 err = terror.ErrDBUnExpect.Delegate(errors.Errorf("execute ddl %v error", ddlJob.ddls)) 1452 failpoint.Goto("bypass") 1453 }) 1454 1455 if !ignore { 1456 failpoint.Inject("SkipSaveGlobalPoint", func() { 1457 s.tctx.L().Info("skip save global point", zap.String("failpoint", "SkipSaveGlobalPoint")) 1458 panic("SkipSaveGlobalPoint") 1459 }) 1460 // set timezone 1461 if ddlJob.timezone != "" { 1462 s.timezoneLastTime = ddlJob.timezone 1463 setTimezoneSQL := fmt.Sprintf("SET SESSION TIME_ZONE = '%s'", ddlJob.timezone) 1464 ddlJob.ddls = append([]string{setTimezoneSQL}, ddlJob.ddls...) 1465 setTimezoneSQLDefault := "SET SESSION TIME_ZONE = DEFAULT" 1466 ddlJob.ddls = append(ddlJob.ddls, setTimezoneSQLDefault) 1467 } else if s.timezoneLastTime != "" { 1468 // use last time's time zone 1469 setTimezoneSQL := fmt.Sprintf("SET SESSION TIME_ZONE = '%s'", s.timezoneLastTime) 1470 ddlJob.ddls = append([]string{setTimezoneSQL}, ddlJob.ddls...) 1471 setTimezoneSQLDefault := "SET SESSION TIME_ZONE = DEFAULT" 1472 ddlJob.ddls = append(ddlJob.ddls, setTimezoneSQLDefault) 1473 } 1474 // set timestamp 1475 setTimestampSQL := fmt.Sprintf("SET TIMESTAMP = %d", ddlJob.timestamp) 1476 ddlJob.ddls = append([]string{setTimestampSQL}, ddlJob.ddls...) 
1477 setTimestampSQLDefault := "SET TIMESTAMP = DEFAULT" 1478 ddlJob.ddls = append(ddlJob.ddls, setTimestampSQLDefault) 1479 1480 var affected int 1481 var ddlCreateTime int64 = -1 // default when scan failed 1482 row, err2 := db.QuerySQL(s.syncCtx, s.metricsProxies, "SELECT UNIX_TIMESTAMP()") 1483 if err2 != nil { 1484 s.tctx.L().Warn("selecting unix timestamp failed", zap.Error(err2)) 1485 } else { 1486 for row.Next() { 1487 err2 = row.Scan(&ddlCreateTime) 1488 if err2 != nil { 1489 s.tctx.L().Warn("getting ddlCreateTime failed", zap.Error(err2)) 1490 } 1491 } 1492 //nolint:sqlclosecheck 1493 _ = row.Close() 1494 _ = row.Err() 1495 } 1496 affected, err = db.ExecuteSQLWithIgnore(s.syncCtx, s.metricsProxies, errorutil.IsIgnorableMySQLDDLError, ddlJob.ddls) 1497 failpoint.Inject("TestHandleSpecialDDLError", func() { 1498 err = mysql2.ErrInvalidConn 1499 // simulate the value of affected along with the injected error due to the adding of SET SQL of timezone and timestamp 1500 if affected == 0 { 1501 affected++ 1502 } 1503 }) 1504 if err != nil { 1505 err = s.handleSpecialDDLError(s.syncCtx, err, ddlJob.ddls, affected, db, ddlCreateTime) 1506 err = terror.WithScope(err, terror.ScopeDownstream) 1507 } 1508 } 1509 failpoint.Label("bypass") 1510 failpoint.Inject("SafeModeExit", func(val failpoint.Value) { 1511 if intVal, ok := val.(int); ok && (intVal == 2 || intVal == 3) { 1512 s.tctx.L().Warn("mock safe mode error", zap.Strings("DDL", ddlJob.ddls), zap.String("failpoint", "SafeModeExit")) 1513 if intVal == 2 { 1514 err = terror.ErrWorkerDDLLockInfoNotFound.Generatef("DDL info not found") 1515 } else { 1516 err = terror.ErrDBExecuteFailed.Delegate(errors.Errorf("execute ddl %v error", ddlJob.ddls)) 1517 } 1518 } 1519 }) 1520 // If downstream has error (which may cause by tracker is more compatible than downstream), we should stop handling 1521 // this job, set `s.execError` to let caller of `addJob` discover error 1522 if err != nil { 1523 s.execError.Store(err) 1524 if !utils.IsContextCanceledError(err) { 1525 err = s.handleEventError(err, ddlJob.startLocation, ddlJob.currentLocation, true, ddlJob.originSQL) 1526 s.runFatalChan <- unit.NewProcessError(err) 1527 } 1528 s.jobWg.Done() 1529 continue 1530 } 1531 1532 switch s.cfg.ShardMode { 1533 case config.ShardPessimistic: 1534 // for sharding DDL syncing, send result back 1535 shardInfo := s.pessimist.PendingInfo() 1536 switch { 1537 case shardInfo == nil: 1538 // no need to do the shard DDL handle for `CREATE DATABASE/TABLE` now. 1539 s.tctx.L().Warn("skip shard DDL handle in pessimistic shard mode", zap.Strings("ddl", ddlJob.ddls)) 1540 case shardPessimistOp == nil: 1541 err = terror.ErrWorkerDDLLockOpNotFound.Generate(shardInfo) 1542 default: 1543 err = s.pessimist.DoneOperationDeleteInfo(*shardPessimistOp, *shardInfo) 1544 } 1545 case config.ShardOptimistic: 1546 shardInfo := s.optimist.PendingInfo() 1547 switch { 1548 case shardInfo == nil: 1549 // no need to do the shard DDL handle for `DROP DATABASE/TABLE` now. 1550 // but for `CREATE DATABASE` and `ALTER DATABASE` we execute it to the downstream directly without `shardInfo`. 1551 if ignore { // actually ignored. 
1552 s.tctx.L().Warn("skip shard DDL handle in optimistic shard mode", zap.Strings("ddl", ddlJob.ddls)) 1553 } 1554 case s.optimist.PendingOperation() == nil: 1555 err = terror.ErrWorkerDDLLockOpNotFound.Generate(shardInfo) 1556 default: 1557 err = s.optimist.DoneOperation(*(s.optimist.PendingOperation())) 1558 } 1559 } 1560 if err != nil { 1561 if s.execError.Load() == nil { 1562 s.execError.Store(err) 1563 } 1564 if !utils.IsContextCanceledError(err) { 1565 err = s.handleEventError(err, ddlJob.startLocation, ddlJob.currentLocation, true, ddlJob.originSQL) 1566 s.runFatalChan <- unit.NewProcessError(err) 1567 } 1568 s.jobWg.Done() 1569 continue 1570 } 1571 s.jobWg.Done() 1572 s.updateJobMetrics(true, queueBucket, ddlJob) 1573 } 1574 } 1575 1576 func (s *Syncer) successFunc(queueID int, statementsCnt int, jobs []*job) { 1577 queueBucket := queueBucketName(queueID) 1578 if len(jobs) > 0 { 1579 // NOTE: we can use the first job of job queue to calculate lag because when this job committed, 1580 // every event before this job's event in this queue has already commit. 1581 // and we can use this job to maintain the oldest binlog event ts among all workers. 1582 j := jobs[0] 1583 switch j.tp { 1584 case ddl: 1585 s.metricsProxies.BinlogEventCost.WithLabelValues(metrics.BinlogEventCostStageDDLExec, s.cfg.Name, s.cfg.WorkerName, s.cfg.SourceID).Observe(time.Since(j.jobAddTime).Seconds()) 1586 case dml: 1587 s.metricsProxies.BinlogEventCost.WithLabelValues(metrics.BinlogEventCostStageDMLExec, s.cfg.Name, s.cfg.WorkerName, s.cfg.SourceID).Observe(time.Since(j.jobAddTime).Seconds()) 1588 // metric only increases by 1 because dm batches sql jobs in a single transaction. 1589 s.metricsProxies.Metrics.FinishedTransactionTotal.Inc() 1590 } 1591 } 1592 1593 for _, sqlJob := range jobs { 1594 s.updateJobMetrics(true, queueBucket, sqlJob) 1595 } 1596 s.updateReplicationJobTS(nil, dmlWorkerJobIdx(queueID)) 1597 s.metricsProxies.ReplicationTransactionBatch.WithLabelValues(s.cfg.WorkerName, s.cfg.Name, s.cfg.SourceID, queueBucket, "statements").Observe(float64(statementsCnt)) 1598 s.metricsProxies.ReplicationTransactionBatch.WithLabelValues(s.cfg.WorkerName, s.cfg.Name, s.cfg.SourceID, queueBucket, "rows").Observe(float64(len(jobs))) 1599 } 1600 1601 func (s *Syncer) fatalFunc(job *job, err error) { 1602 s.execError.Store(err) 1603 if !utils.IsContextCanceledError(err) { 1604 err = s.handleEventError(err, job.startLocation, job.currentLocation, false, "") 1605 s.runFatalChan <- unit.NewProcessError(err) 1606 } 1607 } 1608 1609 // DML synced with causality. 1610 func (s *Syncer) syncDML() { 1611 defer s.runWg.Done() 1612 1613 dmlJobCh := s.dmlJobCh 1614 if s.cfg.Compact { 1615 dmlJobCh = compactorWrap(dmlJobCh, s) 1616 } 1617 causalityCh := causalityWrap(dmlJobCh, s) 1618 flushCh := dmlWorkerWrap(causalityCh, s) 1619 1620 for range flushCh { 1621 s.jobWg.Done() 1622 } 1623 } 1624 1625 func (s *Syncer) waitBeforeRunExit(ctx context.Context) { 1626 defer s.runWg.Done() 1627 failpoint.Inject("checkCheckpointInMiddleOfTransaction", func() { 1628 s.tctx.L().Info("incr maxPauseOrStopWaitTime time ") 1629 defaultMaxPauseOrStopWaitTime = time.Minute * 10 1630 }) 1631 select { 1632 case <-ctx.Done(): // hijack the root context from s.Run to wait for the transaction to end. 
1633 s.tctx.L().Info("received subtask's done, try graceful stop") 1634 needToExitTime := time.Now() 1635 s.waitTransactionLock.Lock() 1636 1637 failpoint.Inject("checkWaitDuration", func(_ failpoint.Value) { 1638 s.isTransactionEnd = false 1639 }) 1640 if s.isTransactionEnd { 1641 s.waitXIDJob.Store(int64(waitComplete)) 1642 s.waitTransactionLock.Unlock() 1643 s.tctx.L().Info("the last job is transaction end, done directly") 1644 s.runCancel() 1645 return 1646 } 1647 s.waitXIDJob.Store(int64(waiting)) 1648 s.waitTransactionLock.Unlock() 1649 s.refreshCliArgs() 1650 1651 waitDuration := defaultMaxPauseOrStopWaitTime 1652 if s.cliArgs != nil && s.cliArgs.WaitTimeOnStop != "" { 1653 waitDuration, _ = time.ParseDuration(s.cliArgs.WaitTimeOnStop) 1654 } 1655 prepareForWaitTime := time.Since(needToExitTime) 1656 failpoint.Inject("recordAndIgnorePrepareTime", func() { 1657 prepareForWaitTime = time.Duration(0) 1658 }) 1659 waitDuration -= prepareForWaitTime 1660 failpoint.Inject("recordAndIgnorePrepareTime", func() { 1661 waitBeforeRunExitDurationForTest = waitDuration 1662 }) 1663 if waitDuration.Seconds() <= 0 { 1664 s.tctx.L().Info("wait transaction end timeout, exit now") 1665 s.runCancel() 1666 return 1667 } 1668 failpoint.Inject("checkWaitDuration", func(val failpoint.Value) { 1669 if testDuration, testError := time.ParseDuration(val.(string)); testError == nil { 1670 if testDuration.Seconds() == waitDuration.Seconds() { 1671 panic("success check wait_time_on_stop !!!") 1672 } else { 1673 s.tctx.L().Error("checkWaitDuration fail", zap.Duration("testDuration", testDuration), zap.Duration("waitDuration", waitDuration)) 1674 } 1675 } else { 1676 s.tctx.L().Error("checkWaitDuration error", zap.Error(testError)) 1677 } 1678 }) 1679 1680 select { 1681 case <-s.runCtx.Ctx.Done(): 1682 s.tctx.L().Info("syncer run exit so runCtx done") 1683 case <-time.After(waitDuration): 1684 s.tctx.L().Info("wait transaction end timeout, exit now") 1685 s.runCancel() 1686 } 1687 case <-s.runCtx.Ctx.Done(): // when no graceful stop, run ctx will canceled first. 1688 s.tctx.L().Info("received ungraceful exit ctx, exit now") 1689 } 1690 } 1691 1692 func (s *Syncer) updateTSOffsetCronJob(ctx context.Context) { 1693 defer s.runWg.Done() 1694 // temporarily hard code there. if this Proxies works well add this to config file. 1695 ticker := time.NewTicker(time.Minute * 10) 1696 defer ticker.Stop() 1697 for { 1698 select { 1699 case <-ticker.C: 1700 if utErr := s.updateTSOffset(ctx); utErr != nil { 1701 s.tctx.L().Error("get server unix ts err", zap.Error(utErr)) 1702 } 1703 case <-ctx.Done(): 1704 return 1705 } 1706 } 1707 } 1708 1709 func (s *Syncer) updateLagCronJob(ctx context.Context) { 1710 defer s.runWg.Done() 1711 // temporarily hard code there. if this Proxies works well add this to config file. 1712 ticker := time.NewTicker(time.Millisecond * 100) 1713 defer ticker.Stop() 1714 for { 1715 select { 1716 case <-ticker.C: 1717 s.updateReplicationLagMetric() 1718 case <-ctx.Done(): 1719 return 1720 } 1721 } 1722 } 1723 1724 func (s *Syncer) updateTSOffset(ctx context.Context) error { 1725 t1 := time.Now() 1726 ts, tsErr := s.fromDB.GetServerUnixTS(ctx) 1727 rtt := time.Since(t1).Seconds() 1728 if tsErr == nil { 1729 s.tsOffset.Store(time.Now().Unix() - ts - int64(rtt/2)) 1730 } 1731 return tsErr 1732 } 1733 1734 // Run starts running for sync, we should guarantee it can rerun when paused. 
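// It loads or flushes checkpoints as needed, initializes the schema tracker, starts the DML/DDL workers,
// the checkpoint flush worker and the lag/TS-offset cron jobs, and then enters the binlog event loop until
// the subtask is stopped or a fatal error occurs.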
1735 func (s *Syncer) Run(ctx context.Context) (err error) { 1736 runCtx, runCancel := context.WithCancel(context.Background()) 1737 s.runCtx, s.runCancel = tcontext.NewContext(runCtx, s.tctx.L()), runCancel 1738 syncCtx, syncCancel := context.WithCancel(context.Background()) 1739 s.syncCtx, s.syncCancel = tcontext.NewContext(syncCtx, s.tctx.L()), syncCancel 1740 defer func() { 1741 s.runCancel() 1742 s.closeJobChans() 1743 s.checkpointFlushWorker.Close() 1744 s.runWg.Wait() 1745 // s.syncCancel won't be called when normal exit, this call just to follow the best practice of use context. 1746 s.syncCancel() 1747 }() 1748 1749 // we should start this goroutine as soon as possible, because it's the only goroutine that cancel syncer.Run 1750 s.runWg.Add(1) 1751 go func() { 1752 s.waitBeforeRunExit(ctx) 1753 }() 1754 1755 // before sync run, we get the ts offset from upstream first 1756 if utErr := s.updateTSOffset(ctx); utErr != nil { 1757 return utErr 1758 } 1759 1760 // some initialization that can't be put in Syncer.Init 1761 fresh, err := s.IsFreshTask(s.runCtx.Ctx) 1762 if err != nil { 1763 return err 1764 } 1765 1766 // task command line arguments have the highest priority 1767 skipLoadMeta := false 1768 s.refreshCliArgs() 1769 if s.cliArgs != nil && s.cliArgs.StartTime != "" { 1770 err = s.setGlobalPointByTime(s.runCtx, s.cliArgs.StartTime) 1771 if terror.ErrConfigStartTimeTooLate.Equal(err) { 1772 return err 1773 } 1774 skipLoadMeta = err == nil 1775 } 1776 1777 // some initialization that can't be put in Syncer.Init 1778 if fresh && !skipLoadMeta { 1779 // for fresh task, we try to load checkpoints from meta (file or config item) 1780 err = s.checkpoint.LoadMeta(runCtx) 1781 if err != nil { 1782 return err 1783 } 1784 } 1785 1786 var ( 1787 flushCheckpoint bool 1788 delLoadTask bool 1789 cleanDumpFile = s.cfg.CleanDumpFile 1790 freshAndAllMode bool 1791 ) 1792 flushCheckpoint, err = s.adjustGlobalPointGTID(s.runCtx) 1793 if err != nil { 1794 return err 1795 } 1796 if fresh && config.HasLoad(s.cfg.Mode) { 1797 delLoadTask = true 1798 flushCheckpoint = true 1799 freshAndAllMode = true 1800 } 1801 1802 if s.cfg.Mode == config.ModeIncrement || !fresh { 1803 cleanDumpFile = false 1804 } 1805 1806 s.runWg.Add(1) 1807 go s.syncDML() 1808 s.runWg.Add(1) 1809 go func() { 1810 defer s.runWg.Done() 1811 // also need to use a different ctx. checkpointFlushWorker worker will be closed in the first defer 1812 s.checkpointFlushWorker.Run(s.tctx) 1813 }() 1814 s.runWg.Add(1) 1815 go s.syncDDL(adminQueueName, s.ddlDBConn, s.ddlJobCh) 1816 s.runWg.Add(1) 1817 go s.updateLagCronJob(s.runCtx.Ctx) 1818 s.runWg.Add(1) 1819 go s.updateTSOffsetCronJob(s.runCtx.Ctx) 1820 1821 // some prepare work before the binlog event loop: 1822 // 1. first we flush checkpoint as needed, so in next resume we won't go to Load unit. 1823 // 2. then since we are confident that Load unit is done we can delete the load task etcd KV. 1824 // TODO: we can't handle panic between 1. and 2., or fail to delete the load task etcd KV. 1825 // 3. then we initiate schema tracker 1826 // 4. - when it's a fresh task, load the table structure from dump files into schema tracker, 1827 // and flush them into checkpoint again. 1828 // - when it's a resumed task, load the table structure from checkpoints into schema tracker. 1829 // TODO: we can't handle failure between 1. and 4. After 1. it's not a fresh task. 1830 // 5. 
finally clean the dump files 1831 1832 if flushCheckpoint { 1833 if err = s.flushCheckPoints(); err != nil { 1834 s.tctx.L().Warn("fail to flush checkpoints when starting task", zap.Error(err)) 1835 return err 1836 } 1837 } 1838 if delLoadTask { 1839 if err = s.delLoadTask(); err != nil { 1840 s.tctx.L().Error("error when del load task in etcd", zap.Error(err)) 1841 } 1842 } 1843 1844 if s.schemaTracker == nil { 1845 // some test will set their own schemaTracker and skip the syncer.Init 1846 s.schemaTracker = schema.NewTracker() 1847 } 1848 // prevent creating new Tracker on `Run` in order to avoid 1849 // two different Trackers are invoked in the validator and the syncer. 1850 err = s.schemaTracker.Init(ctx, s.cfg.Name, int(s.SourceTableNamesFlavor), s.downstreamTrackConn, s.tctx.L()) 1851 if err != nil { 1852 return terror.ErrSchemaTrackerInit.Delegate(err) 1853 } 1854 1855 if freshAndAllMode { 1856 err = s.loadTableStructureFromDump(ctx) 1857 if err != nil { 1858 s.tctx.L().Warn("error happened when load table structure from dump files", zap.Error(err)) 1859 cleanDumpFile = false 1860 } else { 1861 err = s.flushCheckPoints() 1862 if err != nil { 1863 s.tctx.L().Warn("error happened when flush table structure from dump files", zap.Error(err)) 1864 cleanDumpFile = false 1865 } 1866 } 1867 } else { 1868 err = s.checkpoint.LoadIntoSchemaTracker(ctx, s.schemaTracker) 1869 if err != nil { 1870 return err 1871 } 1872 } 1873 1874 if cleanDumpFile { 1875 s.tctx.L().Info("try to remove all dump files") 1876 if err = storage.RemoveAll(ctx, s.cfg.Dir, nil); err != nil { 1877 s.tctx.L().Warn("error when remove loaded dump folder", zap.String("data folder", s.cfg.Dir), zap.Error(err)) 1878 } 1879 } 1880 1881 failpoint.Inject("AdjustGTIDExit", func() { 1882 s.tctx.L().Warn("exit triggered", zap.String("failpoint", "AdjustGTIDExit")) 1883 s.streamerController.Close() 1884 utils.OsExit(1) 1885 }) 1886 1887 var ( 1888 endLocation = s.checkpoint.GlobalPoint() // also init to global checkpoint 1889 lastTxnEndLocation = s.checkpoint.GlobalPoint() 1890 1891 currentGTID string 1892 ) 1893 s.tctx.L().Info("replicate binlog from checkpoint", zap.Stringer("checkpoint", lastTxnEndLocation)) 1894 1895 if s.streamerController.IsClosed() { 1896 err = s.streamerController.Start(s.runCtx, lastTxnEndLocation) 1897 if err != nil { 1898 return terror.Annotate(err, "fail to restart streamer controller") 1899 } 1900 } 1901 // syncing progress with sharding DDL group 1902 // 1. use the global streamer to sync regular binlog events 1903 // 2. sharding DDL synced for some sharding groups 1904 // * record first pos, last pos, target schema, target table as re-sync info 1905 // 3. use the re-sync info recorded in step.2 to create a new streamer 1906 // 4. use the new streamer re-syncing for this sharding group 1907 // 5. in sharding group's re-syncing 1908 // * ignore other tables' binlog events 1909 // * compare last pos with current binlog's pos to determine whether re-sync completed 1910 // 6. 
use the global streamer to continue the syncing 1911 var ( 1912 shardingReSyncCh = make(chan *ShardingReSync, 10) 1913 shardingReSync *ShardingReSync 1914 savedGlobalLastLocation binlog.Location 1915 traceSource = fmt.Sprintf("%s.syncer.%s", s.cfg.SourceID, s.cfg.Name) 1916 lastEvent *replication.BinlogEvent 1917 ) 1918 1919 // this is second defer func in syncer.Run so in this time checkpointFlushWorker are still running 1920 defer func() { 1921 if err1 := recover(); err1 != nil { 1922 failpoint.Inject("ExitAfterSaveOnlineDDL", func() { 1923 s.tctx.L().Info("force panic") 1924 panic("ExitAfterSaveOnlineDDL") 1925 }) 1926 s.tctx.L().Error("panic log", zap.Reflect("error message", err1), zap.Stack("stack")) 1927 err = terror.ErrSyncerUnitPanic.Generate(err1) 1928 } 1929 1930 var ( 1931 err2 error 1932 exitSafeModeLoc = endLocation 1933 ) 1934 saveExitSafeModeLoc := func(loc binlog.Location) { 1935 if binlog.CompareLocation(loc, exitSafeModeLoc, s.cfg.EnableGTID) > 0 { 1936 exitSafeModeLoc = loc 1937 } 1938 } 1939 saveExitSafeModeLoc(savedGlobalLastLocation) 1940 if s.cfg.ShardMode == config.ShardOptimistic { 1941 if shardingReSync != nil { 1942 saveExitSafeModeLoc(shardingReSync.latestLocation) 1943 } 1944 for _, latestLoc := range s.osgk.getShardingResyncs() { 1945 saveExitSafeModeLoc(latestLoc) 1946 } 1947 } 1948 s.checkpoint.SaveSafeModeExitPoint(&exitSafeModeLoc) 1949 1950 // flush all jobs before exit 1951 if err2 = s.flushJobs(); err2 != nil { 1952 s.tctx.L().Warn("failed to flush jobs when exit task", zap.Error(err2)) 1953 } 1954 1955 // if any execute error, flush safemode exit point 1956 if err2 = s.execError.Load(); err2 != nil && (terror.ErrDBExecuteFailed.Equal(err2) || terror.ErrDBUnExpect.Equal(err2)) { 1957 if err2 = s.checkpoint.FlushSafeModeExitPoint(s.tctx); err2 != nil { 1958 s.tctx.L().Warn("failed to flush safe mode checkpoints when exit task", zap.Error(err2)) 1959 } 1960 } 1961 }() 1962 1963 now := time.Now() 1964 s.start.Store(now) 1965 s.lastTime.Store(now) 1966 1967 s.initInitExecutedLoc() 1968 s.running.Store(true) 1969 defer s.running.Store(false) 1970 1971 // safeMode makes syncer reentrant. 1972 // we make each operator reentrant to make syncer reentrant. 1973 // `replace` and `delete` are naturally reentrant. 1974 // use `delete`+`replace` to represent `update` can make `update` reentrant. 1975 // but there are no ways to make `update` idempotent, 1976 // if we start syncer at an early position, database must bear a period of inconsistent state, 1977 // it's eventual consistency. 
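// For example, with safe mode on, an upstream INSERT is replayed downstream as REPLACE and an UPDATE as
// DELETE+REPLACE, so re-applying the same binlog range after a restart cannot raise duplicate-key errors,
// at the cost of temporarily exposing intermediate states downstream.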
1978 s.enableSafeModeInitializationPhase(s.runCtx) 1979 1980 closeShardingResync := func() error { 1981 if shardingReSync == nil { 1982 return nil 1983 } 1984 1985 var ( 1986 err2 error 1987 nextLocation = savedGlobalLastLocation 1988 ) 1989 1990 // if remaining DDLs in sequence, redirect global stream to the next sharding DDL position 1991 if !shardingReSync.allResolved { 1992 nextLocation, err2 = s.sgk.ActiveDDLFirstLocation(shardingReSync.targetTable) 1993 if err2 != nil { 1994 return err2 1995 } 1996 } 1997 1998 err3 := s.streamerController.ResetReplicationSyncer(s.tctx, nextLocation) 1999 if err3 != nil { 2000 return err3 2001 } 2002 2003 shardingReSync = nil 2004 return nil 2005 } 2006 2007 maybeSkipNRowsEvent := func(n int) error { 2008 if n == 0 { 2009 return nil 2010 } 2011 2012 for i := 0; i < n; { 2013 e, _, err1 := s.streamerController.GetEvent(s.runCtx) 2014 if err1 != nil { 2015 return err1 2016 } 2017 switch e.Event.(type) { 2018 case *replication.GTIDEvent, *replication.MariadbGTIDEvent: 2019 gtidStr, err2 := event.GetGTIDStr(e) 2020 if err2 != nil { 2021 return err2 2022 } 2023 if currentGTID != gtidStr { 2024 s.tctx.L().Error("after recover GTID-based replication, the first GTID is not the same as the broken one. May meet duplicate entry or corrupt data if table has no PK/UK.", 2025 zap.String("last GTID", currentGTID), 2026 zap.String("GTID after reset", gtidStr), 2027 ) 2028 return nil 2029 } 2030 case *replication.RowsEvent: 2031 i++ 2032 } 2033 } 2034 s.tctx.L().Info("discard event already consumed", zap.Int("count", n), 2035 zap.Any("start location", s.streamerController.GetCurStartLocation()), 2036 zap.Any("end location", s.streamerController.GetCurEndLocation())) 2037 return nil 2038 } 2039 2040 // eventIndex is the rows event index in this transaction, it's used to avoid reading duplicate events in GTID mode 2041 eventIndex := 0 2042 // affectedSourceTables is used for gtid mode to update table point's gtid set after receiving an xid event, 2043 // which means this whole transaction is finished 2044 affectedSourceTables := make(map[string]map[string]struct{}, 0) 2045 // the relay log file may be truncated (not ending with a RotateEvent), in this situation, we may read some rows events 2046 // and then read from the gtid again, so we force enter safe-mode for one more transaction to avoid failure due to 2047 // conflict 2048 2049 // binlog event loop 2050 for { 2051 if s.execError.Load() != nil { 2052 return nil 2053 } 2054 s.currentLocationMu.Lock() 2055 s.currentLocationMu.currentLocation = endLocation 2056 s.currentLocationMu.Unlock() 2057 2058 failpoint.Inject("FakeRedirect", func(val failpoint.Value) { 2059 if len(shardingReSyncCh) == 0 && shardingReSync == nil { 2060 if strVal, ok := val.(string); ok { 2061 pos, gtidSet, err2 := s.fromDB.GetMasterStatus(s.tctx.WithContext(ctx), s.cfg.Flavor) 2062 if err2 != nil { 2063 s.tctx.L().Error("fail to get master status in failpoint FakeRedirect", zap.Error(err2)) 2064 os.Exit(1) 2065 } 2066 loc := binlog.NewLocation(pos, gtidSet) 2067 s.tctx.L().Info("fake redirect", zap.Stringer("endLocation", endLocation), zap.Stringer("latestLocation", loc)) 2068 resync := &ShardingReSync{ 2069 currLocation: endLocation.Clone(), 2070 latestLocation: loc, 2071 targetTable: utils.UnpackTableID(strVal), 2072 allResolved: true, 2073 } 2074 shardingReSyncCh <- resync 2075 } 2076 } 2077 }) 2078 // fetch from sharding resync channel if needed, and redirect global 2079 // stream to current binlog position recorded by ShardingReSync 2080 if
len(shardingReSyncCh) > 0 && shardingReSync == nil { 2081 // some sharding groups need to re-syncing 2082 shardingReSync = <-shardingReSyncCh 2083 s.tctx.L().Debug("starts to handle new shardingResync operation", zap.Stringer("shardingResync", shardingReSync)) 2084 savedGlobalLastLocation = lastTxnEndLocation // save global last location 2085 lastTxnEndLocation = shardingReSync.currLocation 2086 2087 // remove sharding resync global checkpoint location limit from optimistic sharding group keeper 2088 if s.cfg.ShardMode == config.ShardOptimistic { 2089 s.osgk.removeShardingReSync(shardingReSync) 2090 } 2091 2092 err = s.streamerController.ResetReplicationSyncer(s.runCtx, shardingReSync.currLocation) 2093 if err != nil { 2094 return err 2095 } 2096 2097 failpoint.Inject("ReSyncExit", func() { 2098 s.tctx.L().Warn("exit triggered", zap.String("failpoint", "ReSyncExit")) 2099 utils.OsExit(1) 2100 }) 2101 } 2102 // for position mode, we can redirect at any time 2103 // for gtid mode, we can redirect only when current location related gtid's transaction is totally executed and 2104 // next gtid is just updated (because we check if we can end resync by currLoc >= latestLoc) 2105 if s.cfg.ShardMode == config.ShardOptimistic { 2106 canRedirect := true 2107 if s.cfg.EnableGTID { 2108 canRedirect = safeToRedirect(lastEvent) 2109 } else if lastEvent != nil { 2110 if _, ok := lastEvent.Event.(*replication.QueryEvent); ok { 2111 if op := s.optimist.PendingOperation(); op != nil && op.ConflictStage == optimism.ConflictSkipWaitRedirect { 2112 canRedirect = false // don't redirect here at least after a row event 2113 } 2114 } 2115 } 2116 if canRedirect { 2117 op, targetTableID := s.optimist.PendingRedirectOperation() 2118 if op != nil { 2119 // for optimistic sharding mode, if a new sharding group is resolved when syncer is redirecting, 2120 // instead of using the endLocation, the next redirection should share the same latestLocation with the current shardingResync. 2121 // This is to avoid syncer syncs to current shardingResync.latestLocation before, 2122 // we may miss some rows events if we don't check the row events between endLocation and shardingResync.latestLocation. 2123 // TODO: This will cause a potential performance issue. If we have multiple tables not resolved after a huge amount of binlogs but resolved in a short time, 2124 // current implementation will cause syncer to redirect and replay the binlogs in this segment several times. One possible solution is to 2125 // interrupt current resync once syncer meets a new redirect operation, force other tables to be resolved together in the interrupted shardingResync. 
2126 // If we want to do this, we also need to remove the target table check at https://github.com/pingcap/tiflow/blob/af849add84bf26feb2628d3e1e4344830b915fd9/dm/syncer/syncer.go#L2489 2127 endLocation := &endLocation 2128 if shardingReSync != nil { 2129 endLocation = &shardingReSync.latestLocation 2130 } 2131 resolved := s.resolveOptimisticDDL(&eventContext{ 2132 shardingReSyncCh: &shardingReSyncCh, 2133 endLocation: *endLocation, 2134 }, &filter.Table{Schema: op.UpSchema, Name: op.UpTable}, utils.UnpackTableID(targetTableID)) 2135 // if resolved and targetTableID == shardingResync.TargetTableID, we should resolve this resync before we continue 2136 if resolved && shardingReSync != nil && targetTableID == shardingReSync.targetTable.String() { 2137 err = closeShardingResync() 2138 if err != nil { 2139 return err 2140 } 2141 } 2142 continue 2143 } 2144 } 2145 } 2146 2147 startTime := time.Now() 2148 e, op, err := s.streamerController.GetEvent(s.runCtx) 2149 2150 failpoint.Inject("SafeModeExit", func(val failpoint.Value) { 2151 if intVal, ok := val.(int); ok && intVal == 1 { 2152 s.tctx.L().Warn("fail to get event", zap.String("failpoint", "SafeModeExit")) 2153 err = errors.New("connect: connection refused") 2154 } 2155 }) 2156 failpoint.Inject("GetEventErrorInTxn", func(val failpoint.Value) { 2157 if intVal, ok := val.(int); ok && intVal == eventIndex && failOnceForTest.CAS(false, true) { 2158 err = errors.New("failpoint triggered") 2159 s.tctx.L().Warn("failed to get event", zap.Int("event_index", eventIndex), 2160 zap.Any("cur_pos", endLocation), zap.Any("las_pos", lastTxnEndLocation), 2161 zap.Any("pos", e.Header.LogPos), log.ShortError(err)) 2162 } 2163 }) 2164 failpoint.Inject("SleepInTxn", func(val failpoint.Value) { 2165 if intVal, ok := val.(int); ok && intVal == eventIndex && failOnceForTest.CAS(false, true) { 2166 s.tctx.L().Warn("start to sleep 6s and continue for this event", zap.Int("event_index", eventIndex), 2167 zap.Any("cur_pos", endLocation), zap.Any("las_pos", lastTxnEndLocation), 2168 zap.Any("pos", e.Header.LogPos)) 2169 time.Sleep(6 * time.Second) 2170 } 2171 }) 2172 switch { 2173 case err == context.Canceled: 2174 s.tctx.L().Info("binlog replication main routine quit(context canceled)!", zap.Stringer("last location", lastTxnEndLocation)) 2175 return nil 2176 case err == context.DeadlineExceeded: 2177 s.tctx.L().Info("deadline exceeded when fetching binlog event") 2178 continue 2179 // TODO: if we can maintain the lastTxnEndLocation inside streamerController, no need to expose this logic to syncer 2180 case isDuplicateServerIDError(err): 2181 // if the server id is already used, need to use a new server id 2182 s.tctx.L().Info("server id is already used by another slave, will change to a new server id and get event again") 2183 err1 := s.streamerController.UpdateServerIDAndResetReplication(s.tctx, lastTxnEndLocation) 2184 if err1 != nil { 2185 return err1 2186 } 2187 continue 2188 case err == relay.ErrorMaybeDuplicateEvent: 2189 s.tctx.L().Warn("read binlog met a truncated file, will skip events that has been consumed") 2190 err = maybeSkipNRowsEvent(eventIndex) 2191 if err == nil { 2192 continue 2193 } 2194 s.tctx.L().Warn("skip duplicate rows event failed", zap.Error(err)) 2195 } 2196 2197 if err != nil { 2198 s.tctx.L().Error("fail to fetch binlog", log.ShortError(err)) 2199 2200 if isConnectionRefusedError(err) { 2201 return err 2202 } 2203 2204 // TODO: if we can maintain the lastTxnEndLocation inside streamerController, no need to expose this logic to syncer 
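// For retryable streamer errors, replication is reset to the last flushed global checkpoint and
// maybeSkipNRowsEvent drops the rows events of the interrupted transaction that were already consumed,
// so they are not applied twice in GTID mode.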
2205 if s.streamerController.CanRetry(err) { 2206 // GlobalPoint is the last finished transaction location 2207 err = s.streamerController.ResetReplicationSyncer(s.tctx, s.checkpoint.GlobalPoint()) 2208 if err != nil { 2209 return err 2210 } 2211 s.tctx.L().Info("reset replication binlog puller", zap.Any("pos", s.checkpoint.GlobalPoint())) 2212 if err = maybeSkipNRowsEvent(eventIndex); err != nil { 2213 return err 2214 } 2215 continue 2216 } 2217 2218 return terror.ErrSyncerGetEvent.Generate(err) 2219 } 2220 2221 failpoint.Inject("IgnoreSomeTypeEvent", func(val failpoint.Value) { 2222 if e.Header.EventType.String() == val.(string) { 2223 s.tctx.L().Debug("IgnoreSomeTypeEvent", zap.Reflect("event", e)) 2224 failpoint.Continue() 2225 } 2226 }) 2227 2228 // time duration for reading an event from relay log or upstream master. 2229 s.metricsProxies.Metrics.BinlogReadDurationHistogram.Observe(time.Since(startTime).Seconds()) 2230 startTime = time.Now() // reset start time for the next metric. 2231 2232 s.metricsProxies.Metrics.BinlogSyncerPosGauge.Set(float64(e.Header.LogPos)) 2233 index, err := utils.GetFilenameIndex(lastTxnEndLocation.Position.Name) 2234 if err != nil { 2235 s.tctx.L().Warn("fail to get index number of binlog file, may because only specify GTID and hasn't saved according binlog position", log.ShortError(err)) 2236 } else { 2237 s.metricsProxies.Metrics.BinlogSyncerFileGauge.Set(float64(index)) 2238 } 2239 s.binlogSizeCount.Add(int64(e.Header.EventSize)) 2240 s.metricsProxies.Metrics.BinlogEventSizeHistogram.Observe(float64(e.Header.EventSize)) 2241 2242 failpoint.Inject("ProcessBinlogSlowDown", nil) 2243 2244 s.tctx.L().Debug("receive binlog event", zap.Reflect("header", e.Header)) 2245 2246 startLocation := s.streamerController.GetCurStartLocation() 2247 endLocation = s.streamerController.GetCurEndLocation() 2248 lastTxnEndLocation = s.streamerController.GetTxnEndLocation() 2249 2250 if _, ok := e.Event.(*replication.GenericEvent); !ok { 2251 lastEvent = e 2252 } 2253 2254 switch op { 2255 case pb.ErrorOp_Skip: 2256 // try to handle pessimistic sharding? 
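// handle-error skip: only a DDL (query) event can be skipped here; the skipped DDL is still tracked in the
// schema tracker so later events see a consistent table structure, then the global/table checkpoints are
// saved past this event and flushed so the statement is not replayed after resume.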
2257 queryEvent, ok := e.Event.(*replication.QueryEvent) 2258 if !ok { 2259 s.tctx.L().Warn("can't skip an event which is not DDL", zap.Reflect("header", e.Header)) 2260 break 2261 } 2262 ec := eventContext{ 2263 tctx: s.tctx, 2264 header: e.Header, 2265 startLocation: startLocation, 2266 endLocation: endLocation, 2267 lastLocation: lastTxnEndLocation, 2268 } 2269 var sourceTbls map[string]map[string]struct{} 2270 sourceTbls, err = s.trackOriginDDL(queryEvent, ec) 2271 if err != nil { 2272 s.tctx.L().Warn("failed to track query when handle-error skip", zap.Error(err), zap.ByteString("sql", queryEvent.Query)) 2273 } 2274 2275 s.saveGlobalPoint(endLocation) 2276 for sourceSchema, tableMap := range sourceTbls { 2277 if sourceSchema == "" { 2278 continue 2279 } 2280 for sourceTable := range tableMap { 2281 s.saveTablePoint(&filter.Table{Schema: sourceSchema, Name: sourceTable}, endLocation) 2282 } 2283 } 2284 err = s.flushJobs() 2285 if err != nil { 2286 s.tctx.L().Warn("failed to flush jobs when handle-error skip", zap.Error(err)) 2287 } else { 2288 s.tctx.L().Info("flush jobs when handle-error skip") 2289 } 2290 continue 2291 } 2292 2293 // check pass SafeModeExitLoc and try to disable safe mode, but not in sharding or replacing error 2294 safeModeExitLoc := s.checkpoint.SafeModeExitPoint() 2295 if safeModeExitLoc != nil && shardingReSync == nil { 2296 if binlog.CompareLocation(endLocation, *safeModeExitLoc, s.cfg.EnableGTID) > 0 { 2297 s.checkpoint.SaveSafeModeExitPoint(nil) 2298 // must flush here to avoid the following situation: 2299 // 1. quit safe mode 2300 // 2. push forward and replicate some sqls after safeModeExitPoint to downstream 2301 // 3. quit because of network error, fail to flush global checkpoint and new safeModeExitPoint to downstream 2302 // 4. restart again, quit safe mode at safeModeExitPoint, but some sqls after this location have already been replicated to the downstream 2303 if err = s.checkpoint.FlushSafeModeExitPoint(s.runCtx); err != nil { 2304 return err 2305 } 2306 s.tctx.L().Info("disable safe mode in exit point") 2307 if err = s.safeMode.Add(s.runCtx, -1); err != nil { 2308 return err 2309 } 2310 } 2311 } 2312 2313 // set exitSafeModeTS when meet first binlog 2314 if s.firstMeetBinlogTS == nil && s.cliArgs != nil && s.cliArgs.SafeModeDuration != "" && int64(e.Header.Timestamp) != 0 && e.Header.EventType != replication.FORMAT_DESCRIPTION_EVENT { 2315 if checkErr := s.initSafeModeExitTS(int64(e.Header.Timestamp)); checkErr != nil { 2316 return checkErr 2317 } 2318 } 2319 // check if need to exit safe mode by binlog header TS 2320 if s.exitSafeModeTS != nil && shardingReSync == nil { 2321 if checkErr := s.checkAndExitSafeModeByBinlogTS(s.runCtx, int64(e.Header.Timestamp)); checkErr != nil { 2322 return checkErr 2323 } 2324 } 2325 ec := eventContext{ 2326 tctx: s.runCtx, 2327 header: e.Header, 2328 startLocation: startLocation, 2329 endLocation: endLocation, 2330 lastLocation: lastTxnEndLocation, 2331 shardingReSync: shardingReSync, 2332 closeShardingResync: closeShardingResync, 2333 traceSource: traceSource, 2334 safeMode: s.safeMode.Enable(), 2335 startTime: startTime, 2336 shardingReSyncCh: &shardingReSyncCh, 2337 } 2338 2339 var originSQL string // show origin sql when error, only ddl now 2340 var err2 error 2341 var sourceTable *filter.Table 2342 var needContinue bool 2343 var eventType string 2344 2345 funcCommit := func() (bool, error) { 2346 // reset eventIndex and force safeMode flag here. 
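// funcCommit is invoked at a transaction boundary (an XID event or a COMMIT query event): it saves the
// table points collected from this transaction's rows events, finishes a completed sharding re-sync if
// there is one, and enqueues an XID job so the checkpoint can advance to endLocation.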
2347 eventIndex = 0 2348 for schemaName, tableMap := range affectedSourceTables { 2349 for table := range tableMap { 2350 s.saveTablePoint(&filter.Table{Schema: schemaName, Name: table}, endLocation) 2351 } 2352 } 2353 affectedSourceTables = make(map[string]map[string]struct{}) 2354 if shardingReSync != nil { 2355 // TODO: why shardingReSync need to do so, shardingReSync need refactor 2356 shardingReSync.currLocation = endLocation 2357 2358 if binlog.CompareLocation(shardingReSync.currLocation, shardingReSync.latestLocation, s.cfg.EnableGTID) >= 0 { 2359 s.tctx.L().Info("re-replicate shard group was completed", zap.String("event", eventType), zap.Stringer("re-shard", shardingReSync)) 2360 err = closeShardingResync() 2361 if err != nil { 2362 return false, terror.Annotatef(err, "shard group current location %s", shardingReSync.currLocation) 2363 } 2364 return true, nil 2365 } 2366 } 2367 2368 s.tctx.L().Debug("", zap.String("event", eventType), zap.Stringer("last location", lastTxnEndLocation), log.WrapStringerField("location", endLocation)) 2369 2370 job := newXIDJob(endLocation, startLocation, endLocation) 2371 _, err = s.handleJobFunc(job) 2372 return false, err 2373 } 2374 2375 switch ev := e.Event.(type) { 2376 case *replication.RotateEvent: 2377 err2 = s.handleRotateEvent(ev, ec) 2378 case *replication.RowsEvent: 2379 eventIndex++ 2380 s.metricsProxies.Metrics.BinlogEventRowHistogram.Observe(float64(len(ev.Rows))) 2381 sourceTable, err2 = s.handleRowsEvent(ev, ec) 2382 if sourceTable != nil && err2 == nil && s.cfg.EnableGTID { 2383 if _, ok := affectedSourceTables[sourceTable.Schema]; !ok { 2384 affectedSourceTables[sourceTable.Schema] = make(map[string]struct{}) 2385 } 2386 affectedSourceTables[sourceTable.Schema][sourceTable.Name] = struct{}{} 2387 } 2388 case *replication.QueryEvent: 2389 originSQL = strings.TrimSpace(string(ev.Query)) 2390 if originSQL == "COMMIT" { 2391 eventType = "COMMIT query event" 2392 needContinue, err2 = funcCommit() 2393 if needContinue { 2394 continue 2395 } 2396 } else { 2397 err2 = s.ddlWorker.HandleQueryEvent(ev, ec, originSQL) 2398 } 2399 case *replication.XIDEvent: 2400 eventType = "XID" 2401 needContinue, err2 = funcCommit() 2402 if needContinue { 2403 continue 2404 } 2405 case *replication.GenericEvent: 2406 if e.Header.EventType == replication.HEARTBEAT_EVENT { 2407 // flush checkpoint even if there are no real binlog events 2408 if s.checkpoint.LastFlushOutdated() { 2409 s.tctx.L().Info("meet heartbeat event and then flush jobs") 2410 err2 = s.flushJobs() 2411 } 2412 } 2413 case *replication.GTIDEvent, *replication.MariadbGTIDEvent: 2414 currentGTID, err2 = event.GetGTIDStr(e) 2415 if err2 != nil { 2416 return err2 2417 } 2418 case *replication.TransactionPayloadEvent: 2419 for _, tpev := range ev.Events { 2420 switch tpevt := tpev.Event.(type) { 2421 case *replication.RowsEvent: 2422 eventIndex++ 2423 s.metricsProxies.Metrics.BinlogEventRowHistogram.Observe(float64(len(tpevt.Rows))) 2424 ec.header.EventType = tpev.Header.EventType 2425 sourceTable, err2 = s.handleRowsEvent(tpevt, ec) 2426 if sourceTable != nil && err2 == nil && s.cfg.EnableGTID { 2427 if _, ok := affectedSourceTables[sourceTable.Schema]; !ok { 2428 affectedSourceTables[sourceTable.Schema] = make(map[string]struct{}) 2429 } 2430 affectedSourceTables[sourceTable.Schema][sourceTable.Name] = struct{}{} 2431 } 2432 case *replication.QueryEvent: 2433 originSQL = strings.TrimSpace(string(tpevt.Query)) 2434 err2 = s.ddlWorker.HandleQueryEvent(tpevt, ec, originSQL) 2435 case 
*replication.XIDEvent: 2436 eventType = "XID" 2437 needContinue, err2 = funcCommit() 2438 case *replication.TableMapEvent: 2439 case *replication.FormatDescriptionEvent: 2440 default: 2441 s.tctx.L().Warn("unhandled event from transaction payload", zap.String("type", fmt.Sprintf("%T", tpevt))) 2442 } 2443 } 2444 if needContinue { 2445 continue 2446 } 2447 case *replication.TableMapEvent: 2448 case *replication.FormatDescriptionEvent: 2449 default: 2450 s.tctx.L().Warn("unhandled event", zap.String("type", fmt.Sprintf("%T", ev))) 2451 } 2452 if err2 != nil { 2453 if err := s.handleEventError(err2, startLocation, endLocation, e.Header.EventType == replication.QUERY_EVENT, originSQL); err != nil { 2454 return err 2455 } 2456 } 2457 if waitXIDStatus(s.waitXIDJob.Load()) == waitComplete { 2458 // already wait until XID event, we can stop sync now, s.runcancel will be called in defer func 2459 return nil 2460 } 2461 } 2462 } 2463 2464 func (s *Syncer) initSafeModeExitTS(firstBinlogTS int64) error { 2465 // see more in https://github.com/pingcap/tiflow/pull/4601#discussion_r814446628 2466 duration, err := time.ParseDuration(s.cliArgs.SafeModeDuration) 2467 if err != nil { 2468 return err 2469 } 2470 s.firstMeetBinlogTS = &firstBinlogTS 2471 exitTS := firstBinlogTS + int64(duration.Seconds()) 2472 s.exitSafeModeTS = &exitTS 2473 s.tctx.L().Info("safe-mode will disable by task cli args", zap.Int64("ts", exitTS)) 2474 return nil 2475 } 2476 2477 func (s *Syncer) checkAndExitSafeModeByBinlogTS(ctx *tcontext.Context, ts int64) error { 2478 if s.exitSafeModeTS != nil && ts > *s.exitSafeModeTS { 2479 if err := s.safeMode.Add(ctx, -1); err != nil { 2480 return err 2481 } 2482 s.exitSafeModeTS = nil 2483 if !s.safeMode.Enable() { 2484 s.tctx.L().Info("safe-mode disable by task cli args", zap.Int64("ts in binlog", ts)) 2485 } 2486 // delete cliArgs in etcd 2487 clone := *s.cliArgs 2488 clone.SafeModeDuration = "" 2489 if err2 := ha.PutTaskCliArgs(s.cli, s.cfg.Name, []string{s.cfg.SourceID}, clone); err2 != nil { 2490 s.tctx.L().Error("failed to clean safe-mode-duration in task cli args", zap.Error(err2)) 2491 } else { 2492 s.Lock() 2493 s.cliArgs.SafeModeDuration = "" 2494 s.Unlock() 2495 } 2496 } 2497 return nil 2498 } 2499 2500 type eventContext struct { 2501 tctx *tcontext.Context 2502 header *replication.EventHeader 2503 startLocation binlog.Location 2504 endLocation binlog.Location 2505 lastLocation binlog.Location 2506 shardingReSync *ShardingReSync 2507 closeShardingResync func() error 2508 traceSource string 2509 // safeMode is the value of syncer.safeMode when process this event 2510 // syncer.safeMode's value may change on the fly, e.g. after event by pass the safeModeExitPoint 2511 safeMode bool 2512 startTime time.Time 2513 shardingReSyncCh *chan *ShardingReSync 2514 } 2515 2516 // TODO: Further split into smaller functions and group common arguments into a context struct. 
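// handleRotateEvent records the active relay log when a fake rotate event arrives (needed when a task
// starts from GTID without a binlog file name) and, during a sharding re-sync, advances or finishes the
// re-sync according to the rotated location.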
2517 func (s *Syncer) handleRotateEvent(ev *replication.RotateEvent, ec eventContext) error { 2518 failpoint.Inject("MakeFakeRotateEvent", func(val failpoint.Value) { 2519 ec.header.LogPos = 0 2520 ev.NextLogName = []byte(val.(string)) 2521 ec.tctx.L().Info("MakeFakeRotateEvent", zap.String("fake file name", string(ev.NextLogName))) 2522 }) 2523 2524 if utils.IsFakeRotateEvent(ec.header) { 2525 // when user starts a new task with GTID and no binlog file name, we can't know active relay log at init time 2526 // at this case, we update active relay log when receive fake rotate event 2527 if !s.recordedActiveRelayLog { 2528 if err := s.updateActiveRelayLog(mysql.Position{ 2529 Name: string(ev.NextLogName), 2530 Pos: uint32(ev.Position), 2531 }); err != nil { 2532 ec.tctx.L().Warn("failed to update active relay log, will try to update when flush checkpoint", 2533 zap.ByteString("NextLogName", ev.NextLogName), 2534 zap.Uint64("Position", ev.Position), 2535 zap.Error(err)) 2536 } else { 2537 s.recordedActiveRelayLog = true 2538 } 2539 } 2540 } 2541 2542 if ec.shardingReSync != nil { 2543 if binlog.CompareLocation(ec.endLocation, ec.shardingReSync.currLocation, s.cfg.EnableGTID) > 0 { 2544 ec.shardingReSync.currLocation = ec.endLocation 2545 } 2546 if binlog.CompareLocation(ec.shardingReSync.currLocation, ec.shardingReSync.latestLocation, s.cfg.EnableGTID) >= 0 { 2547 ec.tctx.L().Info("re-replicate shard group was completed", zap.String("event", "rotate"), zap.Stringer("re-shard", ec.shardingReSync)) 2548 err := ec.closeShardingResync() 2549 if err != nil { 2550 return err 2551 } 2552 } else { 2553 ec.tctx.L().Debug("re-replicate shard group", zap.String("event", "rotate"), log.WrapStringerField("location", ec.endLocation), zap.Reflect("re-shard", ec.shardingReSync)) 2554 } 2555 return nil 2556 } 2557 2558 ec.tctx.L().Info("", zap.String("event", "rotate"), log.WrapStringerField("location", ec.endLocation)) 2559 return nil 2560 } 2561 2562 // getFlushSeq is used for assigning an auto-incremental sequence number for each flush job. 2563 func (s *Syncer) getFlushSeq() int64 { 2564 s.flushSeq++ 2565 return s.flushSeq 2566 } 2567 2568 // handleRowsEvent returns (affected table, error), affected table means this table's event is committed to handleJob. 2569 func (s *Syncer) handleRowsEvent(ev *replication.RowsEvent, ec eventContext) (*filter.Table, error) { 2570 sourceTable := &filter.Table{ 2571 Schema: string(ev.Table.Schema), 2572 Name: string(ev.Table.Table), 2573 } 2574 targetTable := s.route(sourceTable) 2575 2576 if ec.shardingReSync != nil { 2577 ec.shardingReSync.currLocation = ec.endLocation 2578 // When current progress has passed the latest location in re-sync, we can stop re-sync now. 2579 if binlog.CompareLocation(ec.shardingReSync.currLocation, ec.shardingReSync.latestLocation, s.cfg.EnableGTID) > 0 { 2580 ec.tctx.L().Info("re-replicate shard group was completed", zap.String("event", "row"), zap.Stringer("re-shard", ec.shardingReSync)) 2581 return nil, ec.closeShardingResync() 2582 } 2583 if ec.shardingReSync.targetTable.String() != targetTable.String() { 2584 // in re-syncing, ignore non current sharding group's events 2585 ec.tctx.L().Debug("skip event in re-replicating shard group", zap.String("event", "row"), zap.Stringer("re-shard", ec.shardingReSync)) 2586 return nil, nil 2587 } 2588 } 2589 2590 // For DML position before table checkpoint, ignore it. 
When the position equals to table checkpoint, this event may 2591 // be partially replicated to downstream, we rely on safe-mode to handle it. 2592 if s.checkpoint.IsOlderThanTablePoint(sourceTable, ec.endLocation) { 2593 ec.tctx.L().Debug("ignore obsolete event that is old than table checkpoint", 2594 zap.String("event", "row"), 2595 log.WrapStringerField("location", ec.endLocation), 2596 zap.Stringer("source table", sourceTable)) 2597 return nil, nil 2598 } 2599 2600 ec.tctx.L().Debug("", 2601 zap.String("event", "row"), 2602 zap.Stringer("source table", sourceTable), 2603 zap.Stringer("target table", targetTable), 2604 log.WrapStringerField("location", ec.endLocation), 2605 zap.Reflect("raw event data", ev.Rows)) 2606 2607 needSkip, err := s.skipRowsEvent(sourceTable, ec.header.EventType) 2608 if err != nil { 2609 return nil, err 2610 } 2611 if needSkip { 2612 s.metricsProxies.SkipBinlogDurationHistogram.WithLabelValues("rows", s.cfg.Name, s.cfg.SourceID).Observe(time.Since(ec.startTime).Seconds()) 2613 // for RowsEvent, we should record lastLocation rather than endLocation 2614 return nil, s.recordSkipSQLsLocation(&ec) 2615 } 2616 2617 if (s.cfg.ShardMode == config.ShardPessimistic && s.sgk.InSyncing(sourceTable, targetTable, ec.endLocation)) || 2618 (s.cfg.ShardMode == config.ShardOptimistic && s.osgk.inConflictStage(sourceTable, targetTable)) { 2619 // if in unsync stage and not before active DDL, filter it 2620 // if in sharding re-sync stage and not before active DDL (the next DDL to be synced), filter it 2621 ec.tctx.L().Debug("replicate sharding DDL, filter Rows event", 2622 zap.String("event", "row"), 2623 zap.Stringer("source", sourceTable), 2624 log.WrapStringerField("location", ec.endLocation)) 2625 return nil, nil 2626 } 2627 2628 tableInfo, err := s.getTableInfo(ec.tctx, sourceTable, targetTable) 2629 if err != nil { 2630 return nil, terror.WithScope(err, terror.ScopeDownstream) 2631 } 2632 originRows := ev.Rows 2633 if err2 := checkLogColumns(ev.SkippedColumns); err2 != nil { 2634 return nil, err2 2635 } 2636 2637 extRows := generateExtendColumn(originRows, s.tableRouter, sourceTable, s.cfg.SourceID) 2638 2639 var dmls []*sqlmodel.RowChange 2640 2641 param := &genDMLParam{ 2642 targetTable: targetTable, 2643 originalData: originRows, 2644 sourceTableInfo: tableInfo, 2645 sourceTable: sourceTable, 2646 extendData: extRows, 2647 } 2648 2649 switch ec.header.EventType { 2650 case replication.WRITE_ROWS_EVENTv0, replication.WRITE_ROWS_EVENTv1, replication.WRITE_ROWS_EVENTv2: 2651 exprFilter, err2 := s.exprFilterGroup.GetInsertExprs(sourceTable, tableInfo) 2652 if err2 != nil { 2653 return nil, err2 2654 } 2655 2656 param.safeMode = ec.safeMode 2657 dmls, err = s.genAndFilterInsertDMLs(ec.tctx, param, exprFilter) 2658 if err != nil { 2659 return nil, terror.Annotatef(err, "gen insert sqls failed, sourceTable: %v, targetTable: %v", sourceTable, targetTable) 2660 } 2661 s.metricsProxies.BinlogEventCost.WithLabelValues(metrics.BinlogEventCostStageGenWriteRows, s.cfg.Name, s.cfg.WorkerName, s.cfg.SourceID).Observe(time.Since(ec.startTime).Seconds()) 2662 2663 case replication.UPDATE_ROWS_EVENTv0, replication.UPDATE_ROWS_EVENTv1, replication.UPDATE_ROWS_EVENTv2: 2664 oldExprFilter, newExprFilter, err2 := s.exprFilterGroup.GetUpdateExprs(sourceTable, tableInfo) 2665 if err2 != nil { 2666 return nil, err2 2667 } 2668 2669 param.safeMode = ec.safeMode 2670 dmls, err = s.genAndFilterUpdateDMLs(ec.tctx, param, oldExprFilter, newExprFilter) 2671 if err != nil { 2672 return nil, 
terror.Annotatef(err, "gen update sqls failed, sourceTable: %v, targetTable: %v", sourceTable, targetTable) 2673 } 2674 s.metricsProxies.BinlogEventCost.WithLabelValues(metrics.BinlogEventCostStageGenUpdateRows, s.cfg.Name, s.cfg.WorkerName, s.cfg.SourceID).Observe(time.Since(ec.startTime).Seconds()) 2675 2676 case replication.DELETE_ROWS_EVENTv0, replication.DELETE_ROWS_EVENTv1, replication.DELETE_ROWS_EVENTv2: 2677 exprFilter, err2 := s.exprFilterGroup.GetDeleteExprs(sourceTable, tableInfo) 2678 if err2 != nil { 2679 return nil, err2 2680 } 2681 2682 dmls, err = s.genAndFilterDeleteDMLs(ec.tctx, param, exprFilter) 2683 if err != nil { 2684 return nil, terror.Annotatef(err, "gen delete sqls failed, sourceTable: %v, targetTable: %v", sourceTable, targetTable) 2685 } 2686 s.metricsProxies.BinlogEventCost.WithLabelValues(metrics.BinlogEventCostStageGenDeleteRows, s.cfg.Name, s.cfg.WorkerName, s.cfg.SourceID).Observe(time.Since(ec.startTime).Seconds()) 2687 2688 default: 2689 ec.tctx.L().Debug("ignoring unrecognized event", zap.String("event", "row"), zap.Stringer("type", ec.header.EventType)) 2690 return nil, nil 2691 } 2692 2693 startTime := time.Now() 2694 2695 metricTp := "" 2696 for i := range dmls { 2697 metricTp = dmlMetric[dmls[i].Type()] 2698 job := newDMLJob(dmls[i], &ec) 2699 added2Queue, err2 := s.handleJobFunc(job) 2700 if err2 != nil || !added2Queue { 2701 return nil, err2 2702 } 2703 } 2704 s.metricsProxies.DispatchBinlogDurationHistogram.WithLabelValues(metricTp, s.cfg.Name, s.cfg.SourceID).Observe(time.Since(startTime).Seconds()) 2705 2706 if len(sourceTable.Schema) != 0 && !s.cfg.EnableGTID { 2707 // when in position-based replication, now events before table checkpoint is sent to queue. But in GTID-based 2708 // replication events may share one GTID, so event whose end position is equal to table checkpoint may not be 2709 // sent to queue. We may need event index in GTID to resolve it. 2710 // when in gtid-based replication, we can't save gtid to table point because this transaction may not 2711 // have been fully replicated to downstream. 
So we don't save table point here but record the affected tables and returns 2712 s.saveTablePoint(sourceTable, ec.endLocation) 2713 } 2714 2715 failpoint.Inject("flushFirstJob", func() { 2716 if waitJobsDoneForTest { 2717 s.tctx.L().Info("trigger flushFirstJob") 2718 waitJobsDoneForTest = false 2719 2720 err2 := s.flushJobs() 2721 if err2 != nil { 2722 s.tctx.L().DPanic("flush checkpoint failed", zap.Error(err2)) 2723 } 2724 failpoint.Return(nil, nil) 2725 } 2726 }) 2727 2728 return sourceTable, s.flushIfOutdated() 2729 } 2730 2731 type queryEventContext struct { 2732 *eventContext 2733 2734 p *parser.Parser // used parser 2735 ddlSchema string // used schema 2736 originSQL string // before split 2737 // split multi-schema change DDL into multiple one schema change DDL due to TiDB's limitation 2738 splitDDLs []string // after split before online ddl 2739 appliedDDLs []string // after onlineDDL apply if onlineDDL != nil 2740 needHandleDDLs []string // after route 2741 2742 shardingDDLInfo *ddlInfo 2743 trackInfos []*ddlInfo 2744 sourceTbls map[string]map[string]struct{} // db name -> tb name 2745 onlineDDLTable *filter.Table 2746 eventStatusVars []byte // binlog StatusVars 2747 timestamp uint32 2748 timezone string 2749 } 2750 2751 func (qec *queryEventContext) String() string { 2752 startLocation := qec.startLocation.String() 2753 currentLocation := qec.endLocation.String() 2754 lastLocation := qec.lastLocation.String() 2755 var needHandleDDLs, shardingReSync string 2756 if qec.needHandleDDLs != nil { 2757 needHandleDDLs = strings.Join(qec.needHandleDDLs, ",") 2758 } 2759 if qec.shardingReSync != nil { 2760 shardingReSync = qec.shardingReSync.String() 2761 } 2762 trackInfos := make([]string, 0, len(qec.trackInfos)) 2763 for _, trackInfo := range qec.trackInfos { 2764 trackInfos = append(trackInfos, trackInfo.String()) 2765 } 2766 return fmt.Sprintf("{schema: %s, originSQL: %s, startLocation: %s, endLocation: %s, lastLocation: %s, re-sync: %s, needHandleDDLs: %s, trackInfos: %s}", 2767 qec.ddlSchema, qec.originSQL, startLocation, currentLocation, lastLocation, shardingReSync, needHandleDDLs, strings.Join(trackInfos, ",")) 2768 } 2769 2770 // generateExtendColumn generate extended columns by extractor. 2771 func generateExtendColumn(data [][]interface{}, r *regexprrouter.RouteTable, table *filter.Table, sourceID string) [][]interface{} { 2772 extendCol, extendVal := r.FetchExtendColumn(table.Schema, table.Name, sourceID) 2773 if len(extendCol) == 0 { 2774 return nil 2775 } 2776 2777 rows := make([][]interface{}, len(data)) 2778 for i := range data { 2779 rows[i] = data[i] 2780 for _, v := range extendVal { 2781 rows[i] = append(rows[i], v) 2782 } 2783 } 2784 return rows 2785 } 2786 2787 // trackDDL tracks ddl in schemaTracker. 2788 func (s *Syncer) trackDDL(usedSchema string, trackInfo *ddlInfo, ec *eventContext) error { 2789 var ( 2790 srcTables = trackInfo.sourceTables 2791 targetTables = trackInfo.targetTables 2792 srcTable = srcTables[0] 2793 ) 2794 2795 // Make sure the needed tables are all loaded into the schema tracker. 
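// The flags below are filled per statement type by the switch that follows: whether the source schema, the
// leading tables and the referenced tables must already be present in the tracker, whether the downstream
// index information should be re-tracked, and whether the DDL itself should be replayed on the schema tracker.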
2796 var ( 2797 shouldExecDDLOnSchemaTracker bool 2798 shouldSchemaExist bool 2799 shouldTableExistNum int // tableNames[:shouldTableExistNum] should exist 2800 shouldRefTableExistNum int // tableNames[1:shouldRefTableExistNum] should exist, since the first one is the "caller table" 2801 shouldReTrackDownstreamIndex bool // re-track the downstream index info 2802 ) 2803 2804 switch node := trackInfo.stmtCache.(type) { 2805 case *ast.CreateDatabaseStmt: 2806 shouldExecDDLOnSchemaTracker = true 2807 case *ast.AlterDatabaseStmt: 2808 shouldExecDDLOnSchemaTracker = true 2809 shouldSchemaExist = true 2810 case *ast.DropDatabaseStmt: 2811 shouldExecDDLOnSchemaTracker = true 2812 shouldReTrackDownstreamIndex = true 2813 if s.cfg.ShardMode == "" { 2814 if err := s.checkpoint.DeleteSchemaPoint(ec.tctx, srcTable.Schema); err != nil { 2815 return err 2816 } 2817 } 2818 case *ast.RecoverTableStmt: 2819 shouldExecDDLOnSchemaTracker = true 2820 shouldSchemaExist = true 2821 case *ast.CreateTableStmt, *ast.CreateViewStmt: 2822 shouldExecDDLOnSchemaTracker = true 2823 shouldSchemaExist = true 2824 // for CREATE TABLE LIKE/AS, the reference tables should exist 2825 shouldRefTableExistNum = len(srcTables) 2826 case *ast.DropTableStmt: 2827 shouldExecDDLOnSchemaTracker = true 2828 shouldReTrackDownstreamIndex = true 2829 if err := s.checkpoint.DeleteTablePoint(ec.tctx, srcTable); err != nil { 2830 return err 2831 } 2832 case *ast.RenameTableStmt, *ast.CreateIndexStmt, *ast.DropIndexStmt, *ast.RepairTableStmt: 2833 shouldExecDDLOnSchemaTracker = true 2834 shouldSchemaExist = true 2835 shouldTableExistNum = 1 2836 shouldReTrackDownstreamIndex = true 2837 case *ast.AlterTableStmt: 2838 shouldSchemaExist = true 2839 shouldReTrackDownstreamIndex = true 2840 // for a DDL that adds an FK, since TiDB doesn't fully support it yet, we simply ignore execution of this DDL.
2841 switch { 2842 case len(node.Specs) == 1 && node.Specs[0].Constraint != nil && node.Specs[0].Constraint.Tp == ast.ConstraintForeignKey: 2843 shouldTableExistNum = 1 2844 shouldExecDDLOnSchemaTracker = false 2845 case node.Specs[0].Tp == ast.AlterTableRenameTable: 2846 shouldTableExistNum = 1 2847 shouldExecDDLOnSchemaTracker = true 2848 default: 2849 shouldTableExistNum = len(srcTables) 2850 shouldExecDDLOnSchemaTracker = true 2851 } 2852 case *ast.LockTablesStmt, *ast.UnlockTablesStmt, *ast.CleanupTableLockStmt, *ast.TruncateTableStmt: 2853 break 2854 default: 2855 ec.tctx.L().DPanic("unhandled DDL type cannot be tracked", zap.Stringer("type", reflect.TypeOf(trackInfo.stmtCache))) 2856 } 2857 2858 if shouldReTrackDownstreamIndex { 2859 s.schemaTracker.RemoveDownstreamSchema(s.tctx, targetTables) 2860 } 2861 2862 if shouldSchemaExist { 2863 if err := s.schemaTracker.CreateSchemaIfNotExists(srcTable.Schema); err != nil { 2864 return terror.ErrSchemaTrackerCannotCreateSchema.Delegate(err, srcTable.Schema) 2865 } 2866 } 2867 for i := 0; i < shouldTableExistNum; i++ { 2868 if _, err := s.getTableInfo(ec.tctx, srcTables[i], targetTables[i]); err != nil { 2869 return err 2870 } 2871 } 2872 // skip getTable before in above loop 2873 start := 1 2874 if shouldTableExistNum > start { 2875 start = shouldTableExistNum 2876 } 2877 for i := start; i < shouldRefTableExistNum; i++ { 2878 if err := s.schemaTracker.CreateSchemaIfNotExists(srcTables[i].Schema); err != nil { 2879 return terror.ErrSchemaTrackerCannotCreateSchema.Delegate(err, srcTables[i].Schema) 2880 } 2881 if _, err := s.getTableInfo(ec.tctx, srcTables[i], targetTables[i]); err != nil { 2882 return err 2883 } 2884 } 2885 2886 if shouldExecDDLOnSchemaTracker { 2887 if err := s.schemaTracker.Exec(ec.tctx.Ctx, usedSchema, trackInfo.originStmt); err != nil { 2888 if ignoreTrackerDDLError(err) { 2889 ec.tctx.L().Warn("will ignore a DDL error when tracking", 2890 zap.String("schema", usedSchema), 2891 zap.String("statement", trackInfo.originDDL), 2892 log.WrapStringerField("location", ec.endLocation), 2893 log.ShortError(err)) 2894 return nil 2895 } 2896 ec.tctx.L().Error("cannot track DDL", 2897 zap.String("schema", usedSchema), 2898 zap.String("statement", trackInfo.originDDL), 2899 log.WrapStringerField("location", ec.endLocation), 2900 log.ShortError(err)) 2901 return terror.ErrSchemaTrackerCannotExecDDL.Delegate(err, trackInfo.originDDL) 2902 } 2903 s.exprFilterGroup.ResetExprs(srcTable) 2904 } 2905 2906 return nil 2907 } 2908 2909 func (s *Syncer) trackOriginDDL(ev *replication.QueryEvent, ec eventContext) (map[string]map[string]struct{}, error) { 2910 originSQL := strings.TrimSpace(string(ev.Query)) 2911 if originSQL == "BEGIN" || originSQL == "COMMIT" || originSQL == "" || utils.IsBuildInSkipDDL(originSQL) { 2912 return nil, nil 2913 } 2914 var err error 2915 qec := &queryEventContext{ 2916 eventContext: &ec, 2917 ddlSchema: string(ev.Schema), 2918 originSQL: utils.TrimCtrlChars(originSQL), 2919 splitDDLs: make([]string, 0), 2920 appliedDDLs: make([]string, 0), 2921 sourceTbls: make(map[string]map[string]struct{}), 2922 eventStatusVars: ev.StatusVars, 2923 } 2924 qec.p, err = event.GetParserForStatusVars(ev.StatusVars) 2925 if err != nil { 2926 s.tctx.L().Warn("found error when get sql_mode from binlog status_vars", zap.Error(err)) 2927 } 2928 stmt, err := parseOneStmt(qec) 2929 if err != nil { 2930 // originSQL can't be parsed => can't be tracked by schema tracker 2931 // we can use operate-schema to set a compatible schema after this 
2932 return nil, err 2933 } 2934 2935 if _, ok := stmt.(ast.DDLNode); !ok { 2936 return nil, nil 2937 } 2938 2939 // TiDB can't handle multi schema change DDL, so we split it here. 2940 qec.splitDDLs, err = parserpkg.SplitDDL(stmt, qec.ddlSchema) 2941 if err != nil { 2942 return nil, err 2943 } 2944 2945 for _, sql := range qec.splitDDLs { 2946 sqls, err := s.ddlWorker.processOneDDL(qec, sql) 2947 if err != nil { 2948 s.tctx.L().Warn("processOneDDL failed", zap.Error(err)) 2949 qec.appliedDDLs = append(qec.appliedDDLs, sql) 2950 } else { 2951 qec.appliedDDLs = append(qec.appliedDDLs, sqls...) 2952 } 2953 } 2954 2955 affectedTbls := make(map[string]map[string]struct{}) 2956 for _, sql := range qec.appliedDDLs { 2957 ddlInfo, err := s.ddlWorker.genDDLInfo(qec, sql) 2958 if err != nil { 2959 return nil, err 2960 } 2961 sourceTable := ddlInfo.sourceTables[0] 2962 switch ddlInfo.stmtCache.(type) { 2963 case *ast.DropDatabaseStmt: 2964 delete(affectedTbls, sourceTable.Schema) 2965 case *ast.DropTableStmt: 2966 if affectedTable, ok := affectedTbls[sourceTable.Schema]; ok { 2967 delete(affectedTable, sourceTable.Name) 2968 } 2969 default: 2970 if _, ok := affectedTbls[sourceTable.Schema]; !ok { 2971 affectedTbls[sourceTable.Schema] = make(map[string]struct{}) 2972 } 2973 affectedTbls[sourceTable.Schema][sourceTable.Name] = struct{}{} 2974 } 2975 err = s.trackDDL(qec.ddlSchema, ddlInfo, qec.eventContext) 2976 if err != nil { 2977 return nil, err 2978 } 2979 } 2980 2981 return affectedTbls, nil 2982 } 2983 2984 func (s *Syncer) genRouter() error { 2985 s.tableRouter, _ = regexprrouter.NewRegExprRouter(s.cfg.CaseSensitive, []*router.TableRule{}) 2986 for _, rule := range s.cfg.RouteRules { 2987 err := s.tableRouter.AddRule(rule) 2988 if err != nil { 2989 return terror.ErrSyncerUnitGenTableRouter.Delegate(err) 2990 } 2991 } 2992 return nil 2993 } 2994 2995 func (s *Syncer) loadTableStructureFromDump(ctx context.Context) error { 2996 logger := s.tctx.L() 2997 // TODO: delete this check after we support parallel reading the files to improve load speed 2998 if !storage.IsLocalDiskPath(s.cfg.LoaderConfig.Dir) { 2999 logger.Warn("skip load table structure from dump files for non-local-dir loader because it may be slow", zap.String("loaderDir", s.cfg.LoaderConfig.Dir)) 3000 return nil 3001 } 3002 files, err := storage.CollectDirFiles(ctx, s.cfg.LoaderConfig.Dir, nil) 3003 if err != nil { 3004 logger.Warn("fail to get dump files", zap.Error(err)) 3005 return err 3006 } 3007 var dbs, tables []string 3008 var tableFiles [][2]string // [db, filename] 3009 for f := range files { 3010 // TODO: handle db/table name escaped bu dumpling. 
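// Dump files are matched by dumpling's naming convention (typically `<db>-schema-create.sql` for database
// schemas and `<db>.<table>-schema.sql` for table schemas); tables configured with extend columns are not
// loaded from the dump here.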
3011 if db, ok := utils.GetDBFromDumpFilename(f); ok { 3012 dbs = append(dbs, db) 3013 continue 3014 } 3015 if db, table, ok := utils.GetTableFromDumpFilename(f); ok { 3016 cols, _ := s.tableRouter.FetchExtendColumn(db, table, s.cfg.SourceID) 3017 if len(cols) > 0 { 3018 continue 3019 } 3020 tables = append(tables, dbutil.TableName(db, table)) 3021 tableFiles = append(tableFiles, [2]string{db, f}) 3022 continue 3023 } 3024 } 3025 logger.Info("fetch table structure from dump files", 3026 zap.Strings("database", dbs), 3027 zap.Any("tables", tables)) 3028 for _, db := range dbs { 3029 if err = s.schemaTracker.CreateSchemaIfNotExists(db); err != nil { 3030 return err 3031 } 3032 } 3033 3034 var firstErr error 3035 setFirstErr := func(err error) { 3036 if firstErr == nil { 3037 firstErr = err 3038 } 3039 } 3040 p, err := conn.GetParserFromSQLModeStr(s.cfg.LoaderConfig.SQLMode) 3041 if err != nil { 3042 logger.Error("failed to create parser from SQL Mode, will skip loadTableStructureFromDump", 3043 zap.String("SQLMode", s.cfg.LoaderConfig.SQLMode), 3044 zap.Error(err)) 3045 return err 3046 } 3047 3048 for _, dbAndFile := range tableFiles { 3049 db, file := dbAndFile[0], dbAndFile[1] 3050 content, err2 := storage.ReadFile(ctx, s.cfg.LoaderConfig.Dir, file, nil) 3051 if err2 != nil { 3052 logger.Warn("fail to read file for creating table in schema tracker", 3053 zap.String("db", db), 3054 zap.String("path", s.cfg.LoaderConfig.Dir), 3055 zap.String("file", file), 3056 zap.Error(err2)) 3057 setFirstErr(err2) 3058 continue 3059 } 3060 stmts := bytes.Split(content, []byte(";\n")) 3061 for _, stmt := range stmts { 3062 stmt = bytes.TrimSpace(stmt) 3063 if len(stmt) == 0 || bytes.HasPrefix(stmt, []byte("/*")) { 3064 continue 3065 } 3066 stmtNode, err := p.ParseOneStmt(string(stmt), "", "") 3067 if err != nil { 3068 logger.Warn("fail to parse statement for creating table in schema tracker", 3069 zap.String("db", db), 3070 zap.String("path", s.cfg.LoaderConfig.Dir), 3071 zap.String("file", file), 3072 zap.ByteString("statement", stmt), 3073 zap.Error(err)) 3074 setFirstErr(err) 3075 continue 3076 } 3077 switch v := stmtNode.(type) { 3078 case *ast.SetStmt: 3079 logger.Warn("ignoring statement", 3080 zap.String("type", fmt.Sprintf("%T", v)), 3081 zap.ByteString("statement", stmt)) 3082 case *ast.CreateTableStmt: 3083 err = s.schemaTracker.Exec(ctx, db, stmtNode) 3084 if err != nil { 3085 logger.Warn("fail to create table for dump files", 3086 zap.Any("path", s.cfg.LoaderConfig.Dir), 3087 zap.Any("file", file), 3088 zap.ByteString("statement", stmt), 3089 zap.Error(err)) 3090 setFirstErr(err) 3091 continue 3092 } 3093 s.saveTablePoint( 3094 &filter.Table{Schema: db, Name: v.Table.Name.O}, 3095 s.getFlushedGlobalPoint(), 3096 ) 3097 default: 3098 err = s.schemaTracker.Exec(ctx, db, stmtNode) 3099 if err != nil { 3100 logger.Warn("fail to create table for dump files", 3101 zap.Any("path", s.cfg.LoaderConfig.Dir), 3102 zap.Any("file", file), 3103 zap.ByteString("statement", stmt), 3104 zap.Error(err)) 3105 setFirstErr(err) 3106 continue 3107 } 3108 } 3109 } 3110 } 3111 return firstErr 3112 } 3113 3114 func (s *Syncer) createDBs(ctx context.Context) error { 3115 var err error 3116 dbCfg := s.cfg.From 3117 dbCfg.RawDBCfg = dbconfig.DefaultRawDBConfig().SetReadTimeout(maxDMLConnectionTimeout) 3118 fromDB, fromConns, err := dbconn.CreateConns(s.tctx, s.cfg, conn.UpstreamDBConfig(&dbCfg), 1, s.cfg.DumpIOTotalBytes, s.cfg.DumpUUID) 3119 if err != nil { 3120 return err 3121 } 3122 s.fromDB = 
&dbconn.UpStreamConn{BaseDB: fromDB}
3123 	s.fromConn = fromConns[0]
3124 	baseConn, err := s.fromDB.BaseDB.GetBaseConn(ctx)
3125 	if err != nil {
3126 		return err
3127 	}
3128 	lcFlavor, err := conn.FetchLowerCaseTableNamesSetting(ctx, baseConn)
3129 	if err != nil {
3130 		return err
3131 	}
3132 	s.SourceTableNamesFlavor = lcFlavor
3133 
3134 	hasSQLMode := false
3135 	// get sql_mode from upstream db
3136 	if s.cfg.To.Session == nil {
3137 		s.cfg.To.Session = make(map[string]string)
3138 	} else {
3139 		for k := range s.cfg.To.Session {
3140 			if strings.ToLower(k) == "sql_mode" {
3141 				hasSQLMode = true
3142 				break
3143 			}
3144 		}
3145 	}
3146 	if !hasSQLMode {
3147 		sqlMode, err2 := conn.GetGlobalVariable(tcontext.NewContext(ctx, log.L()), s.fromDB.BaseDB, "sql_mode")
3148 		if err2 != nil {
3149 			s.tctx.L().Warn("cannot get sql_mode from upstream database, the sql_mode will be assigned \"IGNORE_SPACE, NO_AUTO_VALUE_ON_ZERO, ALLOW_INVALID_DATES\"", log.ShortError(err2))
3150 		}
3151 		sqlModes, err3 := conn.AdjustSQLModeCompatible(sqlMode)
3152 		if err3 != nil {
3153 			s.tctx.L().Warn("cannot adjust sql_mode to be compatible, the sql_mode will stay the same", log.ShortError(err3))
3154 		}
3155 		s.cfg.To.Session["sql_mode"] = sqlModes
3156 	}
3157 
3158 	dbCfg = s.cfg.To
3159 	dbCfg.RawDBCfg = dbconfig.DefaultRawDBConfig().
3160 		SetReadTimeout(maxDMLConnectionTimeout).
3161 		SetMaxIdleConns(s.cfg.WorkerCount)
3162 
3163 	s.toDB, s.toDBConns, err = dbconn.CreateConns(s.tctx, s.cfg, conn.DownstreamDBConfig(&dbCfg), s.cfg.WorkerCount, s.cfg.IOTotalBytes, s.cfg.UUID)
3164 	if err != nil {
3165 		dbconn.CloseUpstreamConn(s.tctx, s.fromDB) // release resources acquired earlier before returning with an error
3166 		return err
3167 	}
3168 	// baseConn for ddl
3169 	dbCfg = s.cfg.To
3170 	dbCfg.RawDBCfg = dbconfig.DefaultRawDBConfig().SetReadTimeout(maxDDLConnectionTimeout)
3171 
3172 	var ddlDBConns []*dbconn.DBConn
3173 	s.ddlDB, ddlDBConns, err = dbconn.CreateConns(s.tctx, s.cfg, conn.DownstreamDBConfig(&dbCfg), 2, s.cfg.IOTotalBytes, s.cfg.UUID)
3174 	if err != nil {
3175 		dbconn.CloseUpstreamConn(s.tctx, s.fromDB)
3176 		dbconn.CloseBaseDB(s.tctx, s.toDB)
3177 		return err
3178 	}
3179 	s.ddlDBConn = ddlDBConns[0]
3180 	s.downstreamTrackConn = ddlDBConns[1]
3181 	printServerVersion(s.tctx, s.fromDB.BaseDB, "upstream")
3182 	printServerVersion(s.tctx, s.toDB, "downstream")
3183 
3184 	return nil
3185 }
3186 
3187 // closeDBs closes all opened DBs, the rollback for createConns.
3188 func (s *Syncer) closeDBs() {
3189 	dbconn.CloseUpstreamConn(s.tctx, s.fromDB)
3190 	dbconn.CloseBaseDB(s.tctx, s.toDB)
3191 	dbconn.CloseBaseDB(s.tctx, s.ddlDB)
3192 }
3193 
3194 // recordSkipSQLsLocation records the position of skipped ddl/dml sqls.
3195 // It leaves newJob's sql argument empty to distinguish normal sqls from skipped ones.
3196 func (s *Syncer) recordSkipSQLsLocation(ec *eventContext) error {
3197 	job := newSkipJob(ec)
3198 	_, err := s.handleJobFunc(job)
3199 	return err
3200 }
3201 
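// Editor's note: an illustrative sketch (not part of the original source) of the cleanup
// pattern used by createDBs above: each resource opened in an earlier step is explicitly
// released before an error from a later step is returned, so a failed call leaves nothing
// open. With hypothetical openA/openB helpers the shape is:
//
//	func openBoth() (a, b io.Closer, err error) {
//		if a, err = openA(); err != nil {
//			return nil, nil, err
//		}
//		if b, err = openB(); err != nil {
//			a.Close() // release the earlier resource before surfacing the error
//			return nil, nil, err
//		}
//		return a, b, nil
//	}
//
// createDBs follows the same shape: CloseUpstreamConn/CloseBaseDB are called for the
// connections already created whenever a later CreateConns call fails, and closeDBs is
// the full rollback once everything has been created.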
3202 // flushJobs adds a flush job and waits for all jobs to finish.
3203 // NOTE: currently, the flush job is always a sync operation.
3204 func (s *Syncer) flushJobs() error {
3205 	flushJobSeq := s.getFlushSeq()
3206 	s.tctx.L().Info("flush all jobs", zap.Stringer("global checkpoint", s.checkpoint), zap.Int64("flush job seq", flushJobSeq))
3207 	job := newFlushJob(s.cfg.WorkerCount, flushJobSeq)
3208 	_, err := s.handleJobFunc(job)
3209 	return err
3210 }
3211 
3212 func (s *Syncer) route(table *filter.Table) *filter.Table {
3213 	return route(s.tableRouter, table)
3214 }
3215 
3216 func route(tableRouter *regexprrouter.RouteTable, table *filter.Table) *filter.Table {
3217 	if table.Schema == "" {
3218 		return table
3219 	}
3220 	// nolint:errcheck
3221 	targetSchema, targetTable, _ := tableRouter.Route(table.Schema, table.Name)
3222 	if targetSchema == "" {
3223 		return table
3224 	}
3225 	if targetTable == "" {
3226 		targetTable = table.Name
3227 	}
3228 
3229 	return &filter.Table{Schema: targetSchema, Name: targetTable}
3230 }
3231 
3232 func (s *Syncer) IsRunning() bool {
3233 	return s.running.Load()
3234 }
3235 
3236 func (s *Syncer) isClosed() bool {
3237 	return s.closed.Load()
3238 }
3239 
3240 // Close closes syncer.
3241 func (s *Syncer) Close() {
3242 	s.Lock()
3243 	defer s.Unlock()
3244 
3245 	if s.isClosed() {
3246 		return
3247 	}
3248 	s.stopSync()
3249 	s.closeDBs()
3250 	s.checkpoint.Close()
3251 	s.schemaTracker.Close()
3252 	if s.sgk != nil {
3253 		s.sgk.Close()
3254 	}
3255 	s.closeOnlineDDL()
3256 	// when closing syncer by `stop-task`, remove active relay log from hub
3257 	s.removeActiveRelayLog()
3258 	s.metricsProxies.RemoveLabelValuesWithTaskInMetrics(s.cfg.Name)
3259 
3260 	s.runWg.Wait()
3261 	s.closed.Store(true)
3262 }
3263 
3264 // Kill kills the syncer without a graceful shutdown.
3265 func (s *Syncer) Kill() {
3266 	s.tctx.L().Warn("kill syncer without graceful")
3267 	s.runCancel()
3268 	s.syncCancel()
3269 	s.Close()
3270 }
3271 
3272 // stopSync stops the stream and rolls back the checkpoint. Now it's used by Close() and Pause().
3273 func (s *Syncer) stopSync() {
3274 	// before re-writing the workflow for s.syncer, simply close it here;
3275 	// when resuming, re-create s.syncer
3276 
3277 	if s.streamerController != nil {
3278 		s.streamerController.Close()
3279 	}
3280 
3281 	// try to roll back checkpoints; if they are already flushed, this has no effect. This should be called before closing schemaTracker.
3282 	prePos := s.checkpoint.GlobalPoint()
3283 	s.checkpoint.Rollback()
3284 	currPos := s.checkpoint.GlobalPoint()
3285 	if binlog.CompareLocation(prePos, currPos, s.cfg.EnableGTID) != 0 {
3286 		s.tctx.L().Warn("rollback global checkpoint", zap.Stringer("previous position", prePos), zap.Stringer("current position", currPos))
3287 	}
3288 }
3289 
3290 func (s *Syncer) closeOnlineDDL() {
3291 	if s.onlineDDL != nil {
3292 		s.onlineDDL.Close()
3293 		s.onlineDDL = nil
3294 	}
3295 }
3296 
3297 // Pause implements Unit.Pause.
3298 func (s *Syncer) Pause() {
3299 	if s.isClosed() {
3300 		s.tctx.L().Warn("try to pause, but already closed")
3301 		return
3302 	}
3303 	s.stopSync()
3304 	s.schemaTracker.Close()
3305 }
3306 
3307 // Resume resumes the paused process.
3308 func (s *Syncer) Resume(ctx context.Context, pr chan pb.ProcessResult) {
3309 	var err error
3310 	defer func() {
3311 		if err != nil {
3312 			processError := unit.NewProcessError(err)
3313 			s.handleExitErrMetric(processError)
3314 			pr <- pb.ProcessResult{
3315 				IsCanceled: false,
3316 				Errors: []*pb.ProcessError{
3317 					processError,
3318 				},
3319 			}
3320 		}
3321 	}()
3322 	if s.isClosed() {
3323 		s.tctx.L().Warn("try to resume, but already closed")
3324 		return
3325 	}
3326 
3327 	// continue the processing
3328 	s.reset()
3329 	// reset database conns
3330 	err = s.resetDBs(s.tctx.WithContext(ctx))
3331 	if err != nil {
3332 		return
3333 	}
3334 
3335 	s.Process(ctx, pr)
3336 }
3337 
3338 // CheckCanUpdateCfg checks whether the task config can be updated.
3339 // 1. the task must not be in a pessimistic ddl state.
3340 // 2. only the block-allow-list, route/filter rules and syncerConfig can be updated at this moment.
3341 // 3. some config fields from sourceCfg can also be updated, see more in func `copyConfigFromSource`.
3342 func (s *Syncer) CheckCanUpdateCfg(newCfg *config.SubTaskConfig) error {
3343 	s.RLock()
3344 	defer s.RUnlock()
3345 	// can't update when in sharding merge
3346 	if s.cfg.ShardMode == config.ShardPessimistic {
3347 		_, tables := s.sgk.UnresolvedTables()
3348 		if len(tables) > 0 {
3349 			return terror.ErrSyncerUnitUpdateConfigInSharding.Generate(tables)
3350 		}
3351 	}
3352 
3353 	oldCfg, err := s.cfg.Clone()
3354 	if err != nil {
3355 		return err
3356 	}
3357 	oldCfg.BAList = newCfg.BAList
3358 	oldCfg.BWList = newCfg.BWList
3359 	oldCfg.RouteRules = newCfg.RouteRules
3360 	oldCfg.FilterRules = newCfg.FilterRules
3361 	oldCfg.SyncerConfig = newCfg.SyncerConfig
3362 	oldCfg.To.Session = newCfg.To.Session // session is adjusted in `createDBs`
3363 
3364 	// support fields that changed in func `copyConfigFromSource`
3365 	oldCfg.From = newCfg.From
3366 	oldCfg.Flavor = newCfg.Flavor
3367 	oldCfg.ServerID = newCfg.ServerID
3368 	oldCfg.RelayDir = newCfg.RelayDir
3369 	oldCfg.UseRelay = newCfg.UseRelay
3370 	oldCfg.EnableGTID = newCfg.EnableGTID
3371 	oldCfg.CaseSensitive = newCfg.CaseSensitive
3372 
3373 	if oldCfg.String() != newCfg.String() {
3374 		s.tctx.L().Warn("can not update cfg", zap.Stringer("old cfg", oldCfg), zap.Stringer("new cfg", newCfg))
3375 		return terror.ErrWorkerUpdateSubTaskConfig.Generatef("can't update subtask config for syncer because new config contains some fields that should not be changed, task: %s", s.cfg.Name)
3376 	}
3377 	return nil
3378 }
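// Editor's note: an illustrative sketch (not part of the original source) of the
// "clone, overwrite the mutable fields, then compare" technique CheckCanUpdateCfg uses
// above, shown with a hypothetical toy config type:
//
//	type toyCfg struct {
//		Mutable   string // allowed to differ between old and new
//		Immutable string // must stay the same
//	}
//
//	func canUpdate(oldCfg, newCfg toyCfg) bool {
//		merged := oldCfg                // start from a copy of the old config
//		merged.Mutable = newCfg.Mutable // overwrite every field that is allowed to change
//		// any remaining difference means an immutable field was modified
//		return fmt.Sprintf("%+v", merged) == fmt.Sprintf("%+v", newCfg)
//	}
//
// CheckCanUpdateCfg does the same with SubTaskConfig.Clone() and String(): only the fields
// copied onto oldCfg may legitimately differ; any other difference rejects the update.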
3379 
3380 // Update implements Unit.Update.
3381 // Now it only supports updating the config for routes, filters, column-mappings and block-allow-list.
3382 // No config diff is implemented yet, so it simply re-initializes using the new config.
3383 func (s *Syncer) Update(ctx context.Context, cfg *config.SubTaskConfig) error {
3384 	s.Lock()
3385 	defer s.Unlock()
3386 	if s.cfg.ShardMode == config.ShardPessimistic {
3387 		_, tables := s.sgk.UnresolvedTables()
3388 		if len(tables) > 0 {
3389 			return terror.ErrSyncerUnitUpdateConfigInSharding.Generate(tables)
3390 		}
3391 	}
3392 
3393 	var (
3394 		err             error
3395 		oldBaList       *filter.Filter
3396 		oldTableRouter  *regexprrouter.RouteTable
3397 		oldBinlogFilter *bf.BinlogEvent
3398 	)
3399 
3400 	defer func() {
3401 		if err == nil {
3402 			return
3403 		}
3404 		if oldBaList != nil {
3405 			s.baList = oldBaList
3406 		}
3407 		if oldTableRouter != nil {
3408 			s.tableRouter = oldTableRouter
3409 		}
3410 		if oldBinlogFilter != nil {
3411 			s.binlogFilter = oldBinlogFilter
3412 		}
3413 	}()
3414 
3415 	// update block-allow-list
3416 	oldBaList = s.baList
3417 	s.baList, err = filter.New(cfg.CaseSensitive, cfg.BAList)
3418 	if err != nil {
3419 		return terror.ErrSyncerUnitGenBAList.Delegate(err)
3420 	}
3421 
3422 	// update route
3423 	oldTableRouter = s.tableRouter
3424 	s.tableRouter, err = regexprrouter.NewRegExprRouter(cfg.CaseSensitive, cfg.RouteRules)
3425 	if err != nil {
3426 		return terror.ErrSyncerUnitGenTableRouter.Delegate(err)
3427 	}
3428 
3429 	// update binlog filter
3430 	oldBinlogFilter = s.binlogFilter
3431 	s.binlogFilter, err = bf.NewBinlogEvent(cfg.CaseSensitive, cfg.FilterRules)
3432 	if err != nil {
3433 		return terror.ErrSyncerUnitGenBinlogEventFilter.Delegate(err)
3434 	}
3435 
3436 	switch s.cfg.ShardMode {
3437 	case config.ShardPessimistic:
3438 		// re-init sharding group
3439 		err = s.sgk.Init()
3440 		if err != nil {
3441 			return err
3442 		}
3443 
3444 		err = s.initShardingGroups(context.Background(), false) // FIXME: fix context when re-implementing `Update`
3445 		if err != nil {
3446 			return err
3447 		}
3448 	case config.ShardOptimistic:
3449 		err = s.initOptimisticShardDDL(context.Background()) // FIXME: fix context when re-implementing `Update`
3450 		if err != nil {
3451 			return err
3452 		}
3453 	}
3454 
3455 	// update s.cfg
3456 	s.cfg.BAList = cfg.BAList
3457 	s.cfg.RouteRules = cfg.RouteRules
3458 	s.cfg.FilterRules = cfg.FilterRules
3459 	s.cfg.ColumnMappingRules = cfg.ColumnMappingRules
3460 
3461 	// update timezone
3462 	if s.timezone == nil {
3463 		s.timezone, _, err = str2TimezoneOrFromDB(s.tctx.WithContext(ctx), s.cfg.Timezone, conn.DownstreamDBConfig(&s.cfg.To))
3464 		return err
3465 	}
3466 	// update syncer config
3467 	s.cfg.SyncerConfig = cfg.SyncerConfig
3468 
3469 	// update fields that changed in func `copyConfigFromSource`
3470 	s.cfg.From = cfg.From
3471 	s.cfg.Flavor = cfg.Flavor
3472 	s.cfg.ServerID = cfg.ServerID
3473 	s.cfg.RelayDir = cfg.RelayDir
3474 	s.cfg.UseRelay = cfg.UseRelay
3475 	s.cfg.EnableGTID = cfg.EnableGTID
3476 	s.cfg.CaseSensitive = cfg.CaseSensitive
3477 	return nil
3478 }
3479 
3480 // checkpointID returns the ID used for the checkpoint table.
3481 func (s *Syncer) checkpointID() string {
3482 	if len(s.cfg.SourceID) > 0 {
3483 		return s.cfg.SourceID
3484 	}
3485 	return strconv.FormatUint(uint64(s.cfg.ServerID), 10)
3486 }
3487 
3488 // ShardDDLOperation returns the current pending shard DDL lock operation to handle.
3489 func (s *Syncer) ShardDDLOperation() *pessimism.Operation { 3490 return s.pessimist.PendingOperation() 3491 } 3492 3493 func (s *Syncer) setErrLocation(startLocation, endLocation *binlog.Location, isQueryEventEvent bool) { 3494 s.errLocation.Lock() 3495 defer s.errLocation.Unlock() 3496 3497 s.errLocation.isQueryEvent = isQueryEventEvent 3498 if s.errLocation.startLocation == nil || startLocation == nil { 3499 s.errLocation.startLocation = startLocation 3500 } else if binlog.CompareLocation(*startLocation, *s.errLocation.startLocation, s.cfg.EnableGTID) < 0 { 3501 s.errLocation.startLocation = startLocation 3502 } 3503 3504 if s.errLocation.endLocation == nil || endLocation == nil { 3505 s.errLocation.endLocation = endLocation 3506 } else if binlog.CompareLocation(*endLocation, *s.errLocation.endLocation, s.cfg.EnableGTID) < 0 { 3507 s.errLocation.endLocation = endLocation 3508 } 3509 } 3510 3511 func (s *Syncer) getErrLocation() (*binlog.Location, bool) { 3512 s.errLocation.Lock() 3513 defer s.errLocation.Unlock() 3514 return s.errLocation.startLocation, s.errLocation.isQueryEvent 3515 } 3516 3517 func (s *Syncer) handleEventError(err error, startLocation, endLocation binlog.Location, isQueryEvent bool, originSQL string) error { 3518 if err == nil { 3519 return nil 3520 } 3521 s.setErrLocation(&startLocation, &endLocation, isQueryEvent) 3522 if len(originSQL) > 0 { 3523 return terror.Annotatef(err, "startLocation: [%s], endLocation: [%s], origin SQL: [%s]", startLocation, endLocation, originSQL) 3524 } 3525 return terror.Annotatef(err, "startLocation: [%s], endLocation: [%s]", startLocation, endLocation) 3526 } 3527 3528 func (s *Syncer) adjustGlobalPointGTID(tctx *tcontext.Context) (bool, error) { 3529 location := s.checkpoint.GlobalPoint() 3530 // situations that don't need to adjust 3531 // 1. GTID is not enabled 3532 // 2. location already has GTID position 3533 // 3. location is totally new, has no position info 3534 // 4. 
location is too early thus not a COMMIT location, which happens when it's reset by other logic 3535 if !s.cfg.EnableGTID || !gtid.CheckGTIDSetEmpty(location.GetGTID()) || location.Position.Name == "" || location.Position.Pos == 4 { 3536 return false, nil 3537 } 3538 // set enableGTID to false for new streamerController 3539 streamerController := binlogstream.NewStreamerController( 3540 s.syncCfg, 3541 false, 3542 s.fromDB, 3543 s.cfg.RelayDir, 3544 s.timezone, 3545 s.relay, 3546 s.tctx.L(), 3547 ) 3548 3549 endPos := binlog.RemoveRelaySubDirSuffix(location.Position) 3550 startPos := mysql.Position{ 3551 Name: endPos.Name, 3552 Pos: 0, 3553 } 3554 startLocation := location.Clone() 3555 startLocation.Position = startPos 3556 3557 err := streamerController.Start(tctx, startLocation) 3558 if err != nil { 3559 return false, err 3560 } 3561 defer streamerController.Close() 3562 3563 gs, err := reader.GetGTIDsForPosFromStreamer(tctx.Context(), streamerController.GetStreamer(), endPos) 3564 if err != nil { 3565 s.tctx.L().Warn("fail to get gtids for global location", zap.Stringer("pos", location), zap.Error(err)) 3566 return false, err 3567 } 3568 dbConn, err := s.fromDB.BaseDB.GetBaseConn(tctx.Context()) 3569 if err != nil { 3570 s.tctx.L().Warn("fail to build connection", zap.Stringer("pos", location), zap.Error(err)) 3571 return false, err 3572 } 3573 gs, err = conn.AddGSetWithPurged(tctx.Context(), gs, dbConn) 3574 if err != nil { 3575 s.tctx.L().Warn("fail to merge purged gtidSet", zap.Stringer("pos", location), zap.Error(err)) 3576 return false, err 3577 } 3578 err = location.SetGTID(gs) 3579 if err != nil { 3580 s.tctx.L().Warn("fail to set gtid for global location", zap.Stringer("pos", location), 3581 zap.String("adjusted_gtid", gs.String()), zap.Error(err)) 3582 return false, err 3583 } 3584 s.saveGlobalPoint(location) 3585 // redirect streamer for new gtid set location 3586 err = s.streamerController.ResetReplicationSyncer(tctx, location) 3587 if err != nil { 3588 s.tctx.L().Warn("fail to redirect streamer for global location", zap.Stringer("pos", location), 3589 zap.String("adjusted_gtid", gs.String()), zap.Error(err)) 3590 return false, err 3591 } 3592 return true, nil 3593 } 3594 3595 // delLoadTask is called when finish restoring data, to delete load worker in etcd. 3596 func (s *Syncer) delLoadTask() error { 3597 if s.cli == nil { 3598 return nil 3599 } 3600 _, _, err := ha.DelLoadTask(s.cli, s.cfg.Name, s.cfg.SourceID) 3601 if err != nil { 3602 return err 3603 } 3604 s.tctx.Logger.Info("delete load worker in etcd for all mode", zap.String("task", s.cfg.Name), zap.String("source", s.cfg.SourceID)) 3605 return nil 3606 } 3607 3608 // DM originally cached s.cfg.QueueSize * s.cfg.WorkerCount dml jobs in memory in 2.0.X. 3609 // Now if compact: false, dmlJobCh and dmlWorker will both cached s.cfg.QueueSize * s.cfg.WorkerCount/2 jobs. 3610 // If compact: true, dmlJobCh, compactor buffer, compactor output channel and dmlWorker will all cached s.cfg.QueueSize * s.cfg.WorkerCount/4 jobs. 
3611 func calculateChanSize(queueSize, workerCount int, compact bool) int { 3612 chanSize := queueSize * workerCount / 2 3613 if compact { 3614 chanSize /= 2 3615 } 3616 return chanSize 3617 } 3618 3619 func (s *Syncer) setGlobalPointByTime(tctx *tcontext.Context, timeStr string) error { 3620 t, err := utils.ParseStartTimeInLoc(timeStr, s.upstreamTZ) 3621 if err != nil { 3622 return err 3623 } 3624 3625 var ( 3626 loc *binlog.Location 3627 posTp binlog.PosType 3628 ) 3629 3630 if s.relay != nil { 3631 subDir := s.relay.Status(nil).(*pb.RelayStatus).RelaySubDir 3632 relayDir := path.Join(s.cfg.RelayDir, subDir) 3633 finder := binlog.NewLocalBinlogPosFinder(tctx, s.cfg.EnableGTID, s.cfg.Flavor, relayDir) 3634 loc, posTp, err = finder.FindByTimestamp(t.Unix()) 3635 } else { 3636 finder := binlog.NewRemoteBinlogPosFinder(tctx, s.fromDB.BaseDB, s.syncCfg, s.cfg.EnableGTID) 3637 loc, posTp, err = finder.FindByTimestamp(t.Unix()) 3638 } 3639 if err != nil { 3640 s.tctx.L().Error("fail to find binlog position by timestamp", 3641 zap.Time("time", t), 3642 zap.Error(err)) 3643 return err 3644 } 3645 3646 switch posTp { 3647 case binlog.InRangeBinlogPos: 3648 s.tctx.L().Info("find binlog position by timestamp", 3649 zap.String("time", timeStr), 3650 zap.Stringer("pos", loc)) 3651 case binlog.BelowLowerBoundBinlogPos: 3652 s.tctx.L().Warn("fail to find binlog location by timestamp because the timestamp is too early, will use the earliest binlog location", 3653 zap.String("time", timeStr), 3654 zap.Any("location", loc)) 3655 case binlog.AboveUpperBoundBinlogPos: 3656 return terror.ErrConfigStartTimeTooLate.Generate(timeStr) 3657 } 3658 3659 err = s.checkpoint.DeleteAllTablePoint(tctx) 3660 if err != nil { 3661 return err 3662 } 3663 s.checkpoint.SaveGlobalPointForcibly(*loc) 3664 s.tctx.L().Info("Will replicate from the specified time, the location recorded in checkpoint and config file will be ignored", 3665 zap.String("time", timeStr), 3666 zap.Any("locationOfTheTime", loc)) 3667 return nil 3668 } 3669 3670 func (s *Syncer) getFlushedGlobalPoint() binlog.Location { 3671 return s.checkpoint.FlushedGlobalPoint() 3672 } 3673 3674 func (s *Syncer) getInitExecutedLoc() binlog.Location { 3675 s.RLock() 3676 defer s.RUnlock() 3677 return s.initExecutedLoc.Clone() 3678 } 3679 3680 func (s *Syncer) initInitExecutedLoc() { 3681 s.Lock() 3682 defer s.Unlock() 3683 if s.initExecutedLoc == nil { 3684 p := s.checkpoint.GlobalPoint() 3685 s.initExecutedLoc = &p 3686 } 3687 } 3688 3689 func (s *Syncer) getTrackedTableInfo(table *filter.Table) (*model.TableInfo, error) { 3690 return s.schemaTracker.GetTableInfo(table) 3691 } 3692 3693 func (s *Syncer) getDownStreamTableInfo(tctx *tcontext.Context, tableID string, originTI *model.TableInfo) (*schema.DownstreamTableInfo, error) { 3694 return s.schemaTracker.GetDownStreamTableInfo(tctx, tableID, originTI) 3695 } 3696 3697 func (s *Syncer) getTableInfoFromCheckpoint(table *filter.Table) *model.TableInfo { 3698 return s.checkpoint.GetTableInfo(table.Schema, table.Name) 3699 }
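// Editor's note: an illustrative worked example (not part of the original source) of
// calculateChanSize above, using hypothetical settings rather than claiming DM's defaults:
//
//	calculateChanSize(1024, 16, false) // 1024*16/2 = 8192 jobs buffered in dmlJobCh
//	calculateChanSize(1024, 16, true)  // 8192/2    = 4096 jobs, because the compactor's
//	                                   // buffer and output channel hold their own share
//
// This matches the doc comment on calculateChanSize: the overall in-memory budget of roughly
// queueSize*workerCount jobs is split across the stages of the DML pipeline.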