// Copyright 2019 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// See the License for the specific language governing permissions and
// limitations under the License.

package worker

import (
	"context"
	"sync"
	"time"

	"github.com/go-mysql-org/go-mysql/mysql"
	"github.com/pingcap/failpoint"
	"github.com/pingcap/tiflow/dm/config"
	"github.com/pingcap/tiflow/dm/dumpling"
	"github.com/pingcap/tiflow/dm/loader"
	"github.com/pingcap/tiflow/dm/pb"
	"github.com/pingcap/tiflow/dm/pkg/binlog"
	"github.com/pingcap/tiflow/dm/pkg/gtid"
	"github.com/pingcap/tiflow/dm/pkg/log"
	"github.com/pingcap/tiflow/dm/pkg/shardddl/pessimism"
	"github.com/pingcap/tiflow/dm/pkg/terror"
	"github.com/pingcap/tiflow/dm/pkg/utils"
	"github.com/pingcap/tiflow/dm/relay"
	"github.com/pingcap/tiflow/dm/syncer"
	"github.com/pingcap/tiflow/dm/unit"
	"github.com/prometheus/client_golang/prometheus"
	clientv3 "go.etcd.io/etcd/client/v3"
	"go.uber.org/atomic"
	"go.uber.org/zap"
)

const (
	// waitRelayCatchupTimeout is the timeout to wait for relay catchup
	// when switching from load unit to sync unit.
	waitRelayCatchupTimeout = 30 * time.Second
)

// createUnits is the subtask units initializer.
// It is a package-level variable (defaulting to createRealUnits) so that
// it can be used for testing.
var createUnits = createRealUnits

// createRealUnits creates process units based on task mode.
52 func createRealUnits(cfg *config.SubTaskConfig, etcdClient *clientv3.Client, workerName string, relay relay.Process) []unit.Unit { 53 failpoint.Inject("mockCreateUnitsDumpOnly", func(_ failpoint.Value) { 54 log.L().Info("create mock worker units with dump unit only", zap.String("failpoint", "mockCreateUnitsDumpOnly")) 55 failpoint.Return([]unit.Unit{dumpling.NewDumpling(cfg)}) 56 }) 57 58 us := make([]unit.Unit, 0, 3) 59 switch cfg.Mode { 60 case config.ModeAll: 61 us = append(us, dumpling.NewDumpling(cfg)) 62 us = append(us, loader.NewLightning(cfg, etcdClient, workerName)) 63 us = append(us, syncer.NewSyncer(cfg, etcdClient, relay)) 64 case config.ModeFull: 65 // NOTE: maybe need another checker in the future? 66 us = append(us, dumpling.NewDumpling(cfg)) 67 us = append(us, loader.NewLightning(cfg, etcdClient, workerName)) 68 case config.ModeIncrement: 69 us = append(us, syncer.NewSyncer(cfg, etcdClient, relay)) 70 case config.ModeDump: 71 us = append(us, dumpling.NewDumpling(cfg)) 72 case config.ModeLoadSync: 73 us = append(us, loader.NewLightning(cfg, etcdClient, workerName)) 74 us = append(us, syncer.NewSyncer(cfg, etcdClient, relay)) 75 default: 76 log.L().Error("unsupported task mode", zap.String("subtask", cfg.Name), zap.String("task mode", cfg.Mode)) 77 } 78 return us 79 } 80 81 // SubTask represents a sub task of data migration. 82 type SubTask struct { 83 cfg *config.SubTaskConfig 84 85 initialized atomic.Bool 86 87 l log.Logger 88 89 sync.RWMutex 90 // ctx is used for the whole subtask. It will be created only when we new a subtask. 91 ctx context.Context 92 cancel context.CancelFunc 93 // currCtx is used for one loop. 
It will be created each time we use st.run/st.Resume 94 currCtx context.Context 95 currCancel context.CancelFunc 96 97 units []unit.Unit // units do job one by one 98 currUnit unit.Unit 99 prevUnit unit.Unit 100 resultWg sync.WaitGroup 101 102 stage pb.Stage // stage of current sub task 103 result *pb.ProcessResult // the process result, nil when is processing 104 105 etcdClient *clientv3.Client 106 107 workerName string 108 109 validator *syncer.DataValidator 110 } 111 112 // NewSubTask is subtask initializer 113 // it can be used for testing. 114 var NewSubTask = NewRealSubTask 115 116 // NewRealSubTask creates a new SubTask. 117 func NewRealSubTask(cfg *config.SubTaskConfig, etcdClient *clientv3.Client, workerName string) *SubTask { 118 return NewSubTaskWithStage(cfg, pb.Stage_New, etcdClient, workerName) 119 } 120 121 // NewSubTaskWithStage creates a new SubTask with stage. 122 func NewSubTaskWithStage(cfg *config.SubTaskConfig, stage pb.Stage, etcdClient *clientv3.Client, workerName string) *SubTask { 123 ctx, cancel := context.WithCancel(context.Background()) 124 st := SubTask{ 125 cfg: cfg, 126 stage: stage, 127 l: log.With(zap.String("subtask", cfg.Name)), 128 ctx: ctx, 129 cancel: cancel, 130 etcdClient: etcdClient, 131 workerName: workerName, 132 } 133 updateTaskMetric(st.cfg.Name, st.cfg.SourceID, st.stage, st.workerName) 134 return &st 135 } 136 137 // initUnits initializes the sub task processing units. 
138 func (st *SubTask) initUnits(relay relay.Process) error { 139 st.units = createUnits(st.cfg, st.etcdClient, st.workerName, relay) 140 if len(st.units) < 1 { 141 return terror.ErrWorkerNoAvailUnits.Generate(st.cfg.Name, st.cfg.Mode) 142 } 143 144 initializeUnitSuccess := true 145 // when error occurred, initialized units should be closed 146 // when continue sub task from loader / syncer, ahead units should be closed 147 var needCloseUnits []unit.Unit 148 defer func() { 149 for _, u := range needCloseUnits { 150 u.Close() 151 } 152 153 st.initialized.Store(initializeUnitSuccess) 154 }() 155 156 // every unit does base initialization in `Init`, and this must pass before start running the sub task 157 // other setups can be done in `Process`, like Loader's prepare which depends on Mydumper's output 158 // but setups in `Process` should be treated carefully, let it's compatible with Pause / Resume 159 for i, u := range st.units { 160 ctx, cancel := context.WithTimeout(context.Background(), unit.DefaultInitTimeout) 161 err := u.Init(ctx) 162 cancel() 163 if err != nil { 164 initializeUnitSuccess = false 165 // when init fail, other units initialized before should be closed 166 for j := 0; j < i; j++ { 167 needCloseUnits = append(needCloseUnits, st.units[j]) 168 } 169 return terror.Annotatef(err, "fail to initialize unit %s of subtask %s ", u.Type(), st.cfg.Name) 170 } 171 } 172 173 // if the sub task ran before, some units may be skipped 174 skipIdx := 0 175 for i := len(st.units) - 1; i > 0; i-- { 176 u := st.units[i] 177 ctx, cancel := context.WithTimeout(context.Background(), unit.DefaultInitTimeout) 178 isFresh, err := u.IsFreshTask(ctx) 179 cancel() 180 if err != nil { 181 initializeUnitSuccess = false 182 return terror.Annotatef(err, "fail to get fresh status of subtask %s %s", st.cfg.Name, u.Type()) 183 } else if !isFresh { 184 skipIdx = i 185 st.l.Info("continue unit", zap.Stringer("unit", u.Type())) 186 break 187 } 188 } 189 190 needCloseUnits = 
st.units[:skipIdx] 191 st.units = st.units[skipIdx:] 192 193 st.setCurrUnit(st.units[0]) 194 return nil 195 } 196 197 // Run runs the sub task. 198 // TODO: check concurrent problems. 199 func (st *SubTask) Run(expectStage pb.Stage, expectValidatorStage pb.Stage, relay relay.Process) { 200 if st.Stage() == pb.Stage_Finished || st.Stage() == pb.Stage_Running { 201 st.l.Warn("prepare to run a subtask with invalid stage", 202 zap.Stringer("current stage", st.Stage()), 203 zap.Stringer("expected stage", expectStage)) 204 return 205 } 206 207 if err := st.initUnits(relay); err != nil { 208 st.l.Error("fail to initialize subtask", log.ShortError(err)) 209 st.fail(err) 210 return 211 } 212 213 st.StartValidator(expectValidatorStage, true) 214 215 if expectStage == pb.Stage_Running { 216 st.run() 217 } else { 218 // if not want to run, still need to set the stage. 219 st.setStage(expectStage) 220 } 221 } 222 223 func (st *SubTask) run() { 224 st.setStageAndResult(pb.Stage_Running, nil) // clear previous result 225 ctx, cancel := context.WithCancel(st.ctx) 226 st.setCurrCtx(ctx, cancel) 227 err := st.unitTransWaitCondition(ctx) 228 if err != nil { 229 st.l.Error("wait condition", log.ShortError(err)) 230 st.fail(err) 231 return 232 } else if ctx.Err() != nil { 233 st.l.Error("exit SubTask.run", log.ShortError(ctx.Err())) 234 return 235 } 236 237 cu := st.CurrUnit() 238 st.l.Info("start to run", zap.Stringer("unit", cu.Type())) 239 pr := make(chan pb.ProcessResult, 1) 240 st.resultWg.Add(1) 241 go st.fetchResultAndUpdateStage(pr) 242 go cu.Process(ctx, pr) 243 } 244 245 func (st *SubTask) StartValidator(expect pb.Stage, startWithSubtask bool) { 246 // when validator mode=none 247 if expect == pb.Stage_InvalidStage { 248 return 249 } 250 st.Lock() 251 defer st.Unlock() 252 253 if st.cfg.ValidatorCfg.Mode != config.ValidationFast && st.cfg.ValidatorCfg.Mode != config.ValidationFull { 254 return 255 } 256 var syncerObj *syncer.Syncer 257 var ok bool 258 for _, u := range 
st.units { 259 if syncerObj, ok = u.(*syncer.Syncer); ok { 260 break 261 } 262 } 263 if syncerObj == nil { 264 st.l.Warn("cannot start validator without syncer") 265 return 266 } 267 if st.validator == nil { 268 st.validator = syncer.NewContinuousDataValidator(st.cfg, syncerObj, startWithSubtask) 269 } 270 st.validator.Start(expect) 271 } 272 273 func (st *SubTask) StopValidator() { 274 st.Lock() 275 if st.validator != nil { 276 st.validator.Stop() 277 } 278 st.Unlock() 279 } 280 281 func (st *SubTask) setCurrCtx(ctx context.Context, cancel context.CancelFunc) { 282 st.Lock() 283 // call previous cancel func for safety 284 if st.currCancel != nil { 285 st.currCancel() 286 } 287 st.currCtx = ctx 288 st.currCancel = cancel 289 st.Unlock() 290 } 291 292 func (st *SubTask) callCurrCancel() { 293 st.RLock() 294 st.currCancel() 295 st.RUnlock() 296 } 297 298 // fetchResultAndUpdateStage fetches process result, call Pause of current unit if needed and updates the stage of subtask. 299 func (st *SubTask) fetchResultAndUpdateStage(pr chan pb.ProcessResult) { 300 defer st.resultWg.Done() 301 302 result := <-pr 303 // filter the context canceled error 304 errs := make([]*pb.ProcessError, 0, 2) 305 for _, err := range result.Errors { 306 if !unit.IsCtxCanceledProcessErr(err) { 307 errs = append(errs, err) 308 } 309 } 310 result.Errors = errs 311 312 st.callCurrCancel() // dm-unit finished, canceled or error occurred, always cancel processing 313 314 var ( 315 cu = st.CurrUnit() 316 stage pb.Stage 317 ) 318 319 // update the stage according to result 320 if len(result.Errors) == 0 { 321 switch st.Stage() { 322 case pb.Stage_Pausing: 323 // paused by st.Pause 324 stage = pb.Stage_Paused 325 case pb.Stage_Stopping: 326 // stopped by st.Close 327 stage = pb.Stage_Stopped 328 default: 329 // process finished with no error 330 stage = pb.Stage_Finished 331 } 332 } else { 333 // error occurred, paused 334 stage = pb.Stage_Paused 335 } 336 st.setStageAndResult(stage, &result) 337 338 
st.l.Info("unit process returned", zap.Stringer("unit", cu.Type()), zap.Stringer("stage", stage), zap.String("status", st.StatusJSON())) 339 340 switch stage { 341 case pb.Stage_Finished: 342 cu.Close() 343 nu := st.getNextUnit() 344 if nu == nil { 345 // Now, when finished, it only stops the process 346 // if needed, we can refine to Close it 347 st.l.Info("all process units finished") 348 } else { 349 st.l.Info("switching to next unit", zap.Stringer("unit", cu.Type())) 350 st.setCurrUnit(nu) 351 // NOTE: maybe need a Lock mechanism for sharding scenario 352 st.run() // re-run for next process unit 353 } 354 case pb.Stage_Stopped: 355 // the caller will close current unit and more units after it, so we don't call cu.Close here. 356 case pb.Stage_Paused: 357 cu.Pause() 358 for _, err := range result.Errors { 359 st.l.Error("unit process error", zap.Stringer("unit", cu.Type()), zap.Any("error information", err)) 360 } 361 st.l.Info("paused", zap.Stringer("unit", cu.Type())) 362 } 363 } 364 365 // setCurrUnit set current dm unit to ut. 366 func (st *SubTask) setCurrUnit(cu unit.Unit) { 367 st.Lock() 368 defer st.Unlock() 369 pu := st.currUnit 370 st.currUnit = cu 371 st.prevUnit = pu 372 } 373 374 // CurrUnit returns current dm unit. 375 func (st *SubTask) CurrUnit() unit.Unit { 376 st.RLock() 377 defer st.RUnlock() 378 return st.currUnit 379 } 380 381 // PrevUnit returns dm previous unit. 382 func (st *SubTask) PrevUnit() unit.Unit { 383 st.RLock() 384 defer st.RUnlock() 385 return st.prevUnit 386 } 387 388 // closeUnits closes all un-closed units (current unit and all the subsequent units). 
389 func (st *SubTask) closeUnits() { 390 st.cancel() 391 st.resultWg.Wait() 392 393 var ( 394 cu = st.currUnit 395 cui = -1 396 ) 397 398 for i, u := range st.units { 399 if u == cu { 400 cui = i 401 break 402 } 403 } 404 if cui < 0 { 405 return 406 } 407 408 for i := cui; i < len(st.units); i++ { 409 u := st.units[i] 410 st.l.Info("closing unit process", zap.Stringer("unit", cu.Type())) 411 u.Close() 412 st.l.Info("closing unit done", zap.Stringer("unit", cu.Type())) 413 } 414 } 415 416 func (st *SubTask) killCurrentUnit() { 417 if st.CurrUnit() != nil { 418 ut := st.CurrUnit().Type() 419 st.l.Info("kill unit", zap.String("task", st.cfg.Name), zap.Stringer("unit", ut)) 420 st.CurrUnit().Kill() 421 st.l.Info("kill unit done", zap.String("task", st.cfg.Name), zap.Stringer("unit", ut)) 422 } 423 } 424 425 // getNextUnit gets the next process unit from st.units 426 // if no next unit, return nil. 427 func (st *SubTask) getNextUnit() unit.Unit { 428 var ( 429 nu unit.Unit 430 cui = len(st.units) 431 cu = st.CurrUnit() 432 ) 433 for i, u := range st.units { 434 if u == cu { 435 cui = i 436 } 437 if i == cui+1 { 438 nu = u 439 break 440 } 441 } 442 return nu 443 } 444 445 func (st *SubTask) setStage(stage pb.Stage) { 446 st.Lock() 447 defer st.Unlock() 448 st.stage = stage 449 updateTaskMetric(st.cfg.Name, st.cfg.SourceID, st.stage, st.workerName) 450 } 451 452 func (st *SubTask) setStageAndResult(stage pb.Stage, result *pb.ProcessResult) { 453 st.Lock() 454 defer st.Unlock() 455 st.stage = stage 456 updateTaskMetric(st.cfg.Name, st.cfg.SourceID, st.stage, st.workerName) 457 st.result = result 458 } 459 460 // stageCAS sets stage to newStage if its current value is oldStage. 
461 func (st *SubTask) stageCAS(oldStage, newStage pb.Stage) bool { 462 st.Lock() 463 defer st.Unlock() 464 465 if st.stage == oldStage { 466 st.stage = newStage 467 updateTaskMetric(st.cfg.Name, st.cfg.SourceID, st.stage, st.workerName) 468 return true 469 } 470 return false 471 } 472 473 // setStageIfNotIn sets stage to newStage if its current value is not in oldStages. 474 func (st *SubTask) setStageIfNotIn(oldStages []pb.Stage, newStage pb.Stage) bool { 475 st.Lock() 476 defer st.Unlock() 477 for _, s := range oldStages { 478 if st.stage == s { 479 return false 480 } 481 } 482 st.stage = newStage 483 updateTaskMetric(st.cfg.Name, st.cfg.SourceID, st.stage, st.workerName) 484 return true 485 } 486 487 // setStageIfNotIn sets stage to newStage if its current value is in oldStages. 488 func (st *SubTask) setStageIfIn(oldStages []pb.Stage, newStage pb.Stage) bool { 489 st.Lock() 490 defer st.Unlock() 491 for _, s := range oldStages { 492 if st.stage == s { 493 st.stage = newStage 494 updateTaskMetric(st.cfg.Name, st.cfg.SourceID, st.stage, st.workerName) 495 return true 496 } 497 } 498 return false 499 } 500 501 // Stage returns the stage of the sub task. 502 func (st *SubTask) Stage() pb.Stage { 503 st.RLock() 504 defer st.RUnlock() 505 return st.stage 506 } 507 508 func (st *SubTask) validatorStage() pb.Stage { 509 st.RLock() 510 defer st.RUnlock() 511 if st.validator != nil { 512 return st.validator.Stage() 513 } 514 return pb.Stage_InvalidStage 515 } 516 517 // markResultCanceled mark result as canceled if stage is Paused. 518 // This func is used to pause a task which has been paused by error, 519 // so the task will not auto resume by task checker. 
520 func (st *SubTask) markResultCanceled() bool { 521 st.Lock() 522 defer st.Unlock() 523 if st.stage == pb.Stage_Paused { 524 if st.result != nil && !st.result.IsCanceled { 525 st.l.Info("manually pause task which has been paused by errors") 526 st.result.IsCanceled = true 527 return true 528 } 529 } 530 return false 531 } 532 533 // Result returns the result of the sub task. 534 func (st *SubTask) Result() *pb.ProcessResult { 535 st.RLock() 536 defer st.RUnlock() 537 if st.result == nil { 538 return nil 539 } 540 tempProcessResult, _ := st.result.Marshal() 541 newProcessResult := &pb.ProcessResult{} 542 _ = newProcessResult.Unmarshal(tempProcessResult) 543 return newProcessResult 544 } 545 546 // Close stops the sub task. 547 func (st *SubTask) Close() { 548 st.l.Info("closing") 549 if !st.setStageIfNotIn([]pb.Stage{pb.Stage_Stopped, pb.Stage_Stopping, pb.Stage_Finished}, pb.Stage_Stopping) { 550 st.l.Info("subTask is already closed, no need to close") 551 return 552 } 553 st.closeUnits() // close all un-closed units 554 updateTaskMetric(st.cfg.Name, st.cfg.SourceID, pb.Stage_Stopped, st.workerName) 555 556 // we can start/stop validator independent of task, so we don't set st.validator = nil inside 557 st.StopValidator() 558 st.validator = nil 559 } 560 561 // Kill kill running unit and stop the sub task. 562 func (st *SubTask) Kill() { 563 st.l.Info("killing") 564 if !st.setStageIfNotIn([]pb.Stage{pb.Stage_Stopped, pb.Stage_Stopping, pb.Stage_Finished}, pb.Stage_Stopping) { 565 st.l.Info("subTask is already closed, no need to close") 566 return 567 } 568 st.killCurrentUnit() 569 st.closeUnits() // close all un-closed units 570 571 cfg := st.getCfg() 572 updateTaskMetric(cfg.Name, cfg.SourceID, pb.Stage_Stopped, st.workerName) 573 574 st.StopValidator() 575 st.validator = nil 576 } 577 578 // Pause pauses a running subtask or a subtask paused by error. 
579 func (st *SubTask) Pause() error { 580 if st.markResultCanceled() { 581 return nil 582 } 583 584 if !st.stageCAS(pb.Stage_Running, pb.Stage_Pausing) { 585 return terror.ErrWorkerNotRunningStage.Generate(st.Stage().String()) 586 } 587 588 st.callCurrCancel() 589 st.resultWg.Wait() // wait fetchResultAndUpdateStage set Pause stage 590 591 return nil 592 } 593 594 // Resume resumes the paused sub task 595 // TODO: similar to Run, refactor later. 596 func (st *SubTask) Resume(relay relay.Process) error { 597 if !st.initialized.Load() { 598 expectValidatorStage, err := getExpectValidatorStage(st.cfg.ValidatorCfg, st.etcdClient, st.cfg.SourceID, st.cfg.Name, 0) 599 if err != nil { 600 return terror.Annotate(err, "fail to get validator stage from etcd") 601 } 602 st.Run(pb.Stage_Running, expectValidatorStage, relay) 603 return nil 604 } 605 606 if !st.setStageIfIn([]pb.Stage{pb.Stage_Paused, pb.Stage_Stopped}, pb.Stage_Resuming) { 607 return terror.ErrWorkerNotPausedStage.Generate(st.Stage().String()) 608 } 609 610 ctx, cancel := context.WithCancel(st.ctx) 611 st.setCurrCtx(ctx, cancel) 612 // NOTE: this may block if user resume a task 613 err := st.unitTransWaitCondition(ctx) 614 if err != nil { 615 st.l.Error("wait condition", log.ShortError(err)) 616 st.fail(err) 617 return err 618 } else if ctx.Err() != nil { 619 // ctx.Err() != nil means this context is canceled in other go routine, 620 // that go routine will change the stage, so don't need to set stage to paused here. 621 // nolint:nilerr 622 return nil 623 } 624 625 cu := st.CurrUnit() 626 st.l.Info("resume with unit", zap.Stringer("unit", cu.Type())) 627 628 pr := make(chan pb.ProcessResult, 1) 629 st.resultWg.Add(1) 630 go st.fetchResultAndUpdateStage(pr) 631 go cu.Resume(ctx, pr) 632 633 st.setStageAndResult(pb.Stage_Running, nil) // clear previous result 634 return nil 635 } 636 637 // Update update the sub task's config. 
638 func (st *SubTask) Update(ctx context.Context, cfg *config.SubTaskConfig) error { 639 if !st.stageCAS(pb.Stage_Paused, pb.Stage_Paused) { // only test for Paused 640 return terror.ErrWorkerUpdateTaskStage.Generate(st.Stage().String()) 641 } 642 643 for _, u := range st.units { 644 err := u.Update(ctx, cfg) 645 if err != nil { 646 return err 647 } 648 } 649 st.SetCfg(*cfg) 650 return nil 651 } 652 653 // OperateSchema operates schema for an upstream table. 654 func (st *SubTask) OperateSchema(ctx context.Context, req *pb.OperateWorkerSchemaRequest) (schema string, err error) { 655 switch req.Op { 656 case pb.SchemaOp_ListMigrateTargets: 657 if st.Stage() != pb.Stage_Running && st.Stage() != pb.Stage_Paused { 658 return "", terror.ErrWorkerNotPausedStage.Generate(st.Stage().String()) 659 } 660 default: 661 if st.Stage() != pb.Stage_Paused { 662 return "", terror.ErrWorkerNotPausedStage.Generate(st.Stage().String()) 663 } 664 } 665 666 syncUnit, ok := st.currUnit.(*syncer.Syncer) 667 if !ok { 668 return "", terror.ErrWorkerOperSyncUnitOnly.Generate(st.currUnit.Type()) 669 } 670 671 if st.validatorStage() == pb.Stage_Running && req.Op != pb.SchemaOp_ListMigrateTargets { 672 return "", terror.ErrWorkerValidatorNotPaused.Generate(pb.Stage_Running.String()) 673 } 674 675 return syncUnit.OperateSchema(ctx, req) 676 } 677 678 // CheckUnit checks whether current unit is sync unit. 679 func (st *SubTask) CheckUnit() bool { 680 st.RLock() 681 defer st.RUnlock() 682 flag := true 683 if _, ok := st.currUnit.(*syncer.Syncer); !ok { 684 flag = false 685 } 686 return flag 687 } 688 689 // CheckUnitCfgCanUpdate checks this unit cfg can update. 
690 func (st *SubTask) CheckUnitCfgCanUpdate(cfg *config.SubTaskConfig) error { 691 st.RLock() 692 defer st.RUnlock() 693 694 if st.currUnit == nil { 695 return terror.ErrWorkerUpdateSubTaskConfig.Generate(cfg.Name, pb.UnitType_InvalidUnit) 696 } 697 698 switch st.currUnit.Type() { 699 case pb.UnitType_Sync: 700 if s, ok := st.currUnit.(*syncer.Syncer); ok { 701 return s.CheckCanUpdateCfg(cfg) 702 } 703 // skip check for mock sync unit 704 default: 705 return terror.ErrWorkerUpdateSubTaskConfig.Generate(cfg.Name, st.currUnit.Type()) 706 } 707 return nil 708 } 709 710 // ShardDDLOperation returns the current shard DDL lock operation. 711 func (st *SubTask) ShardDDLOperation() *pessimism.Operation { 712 st.RLock() 713 defer st.RUnlock() 714 715 cu := st.currUnit 716 syncer2, ok := cu.(*syncer.Syncer) 717 if !ok { 718 return nil 719 } 720 721 return syncer2.ShardDDLOperation() 722 } 723 724 // unitTransWaitCondition waits when transferring from current unit to next unit. 725 // Currently there is only one wait condition 726 // from Load unit to Sync unit, wait for relay-log catched up with mydumper binlog position. 
727 func (st *SubTask) unitTransWaitCondition(subTaskCtx context.Context) error { 728 var ( 729 gset1 mysql.GTIDSet 730 gset2 mysql.GTIDSet 731 pos1 *mysql.Position 732 pos2 *mysql.Position 733 err error 734 ) 735 pu := st.PrevUnit() 736 cu := st.CurrUnit() 737 if pu != nil && pu.Type() == pb.UnitType_Load && cu.Type() == pb.UnitType_Sync { 738 st.l.Info("wait condition between two units", zap.Stringer("previous unit", pu.Type()), zap.Stringer("unit", cu.Type())) 739 hub := GetConditionHub() 740 741 if !hub.w.relayEnabled.Load() { 742 return nil 743 } 744 745 ctxWait, cancelWait := context.WithTimeout(hub.w.ctx, waitRelayCatchupTimeout) 746 defer cancelWait() 747 748 loadStatus := pu.Status(nil).(*pb.LoadStatus) 749 750 cfg := st.getCfg() 751 if cfg.EnableGTID { 752 gset1, err = gtid.ParserGTID(cfg.Flavor, loadStatus.MetaBinlogGTID) 753 if err != nil { 754 return terror.WithClass(err, terror.ClassDMWorker) 755 } 756 } else { 757 pos1, err = utils.DecodeBinlogPosition(loadStatus.MetaBinlog) 758 if err != nil { 759 return terror.WithClass(err, terror.ClassDMWorker) 760 } 761 } 762 763 for { 764 relayStatus := hub.w.relayHolder.Status(nil) 765 766 if cfg.EnableGTID { 767 gset2, err = gtid.ParserGTID(cfg.Flavor, relayStatus.RelayBinlogGtid) 768 if err != nil { 769 return terror.WithClass(err, terror.ClassDMWorker) 770 } 771 rc, ok := binlog.CompareGTID(gset1, gset2) 772 if !ok { 773 return terror.ErrWorkerWaitRelayCatchupGTID.Generate(loadStatus.MetaBinlogGTID, relayStatus.RelayBinlogGtid) 774 } 775 if rc <= 0 { 776 break 777 } 778 } else { 779 pos2, err = utils.DecodeBinlogPosition(relayStatus.RelayBinlog) 780 if err != nil { 781 return terror.WithClass(err, terror.ClassDMWorker) 782 } 783 if pos1.Compare(*pos2) <= 0 { 784 break 785 } 786 } 787 788 st.l.Debug("wait relay to catchup", zap.Bool("enableGTID", cfg.EnableGTID), zap.Stringer("load end position", pos1), zap.String("load end gtid", loadStatus.MetaBinlogGTID), zap.Stringer("relay position", pos2), 
zap.String("relay gtid", relayStatus.RelayBinlogGtid)) 789 790 select { 791 case <-ctxWait.Done(): 792 if cfg.EnableGTID { 793 return terror.ErrWorkerWaitRelayCatchupTimeout.Generate(waitRelayCatchupTimeout, loadStatus.MetaBinlogGTID, relayStatus.RelayBinlogGtid) 794 } 795 return terror.ErrWorkerWaitRelayCatchupTimeout.Generate(waitRelayCatchupTimeout, pos1, pos2) 796 case <-subTaskCtx.Done(): 797 return nil 798 case <-time.After(time.Millisecond * 50): 799 } 800 } 801 st.l.Info("relay binlog pos catchup loader end binlog pos") 802 } 803 return nil 804 } 805 806 func (st *SubTask) fail(err error) { 807 st.setStageAndResult(pb.Stage_Paused, &pb.ProcessResult{ 808 Errors: []*pb.ProcessError{ 809 unit.NewProcessError(err), 810 }, 811 }) 812 } 813 814 // HandleError handle error for syncer unit. 815 func (st *SubTask) HandleError(ctx context.Context, req *pb.HandleWorkerErrorRequest, relay relay.Process) (string, error) { 816 // TODO: do we need lock here? 817 syncUnit, ok := st.currUnit.(*syncer.Syncer) 818 if !ok { 819 return "", terror.ErrWorkerOperSyncUnitOnly.Generate(st.currUnit.Type()) 820 } 821 822 msg, err := syncUnit.HandleError(ctx, req) 823 if err != nil { 824 return "", err 825 } 826 827 if st.Stage() == pb.Stage_Paused && req.Op != pb.ErrorOp_List { 828 err = st.Resume(relay) 829 } 830 return msg, err 831 } 832 833 func (st *SubTask) getCfg() *config.SubTaskConfig { 834 st.RLock() 835 defer st.RUnlock() 836 return st.cfg 837 } 838 839 func (st *SubTask) SetCfg(subTaskConfig config.SubTaskConfig) { 840 st.Lock() 841 st.cfg = &subTaskConfig 842 st.Unlock() 843 } 844 845 func (st *SubTask) UpdateValidatorCfg(validatorCfg config.ValidatorConfig) { 846 st.Lock() 847 // if user start validator on the fly, we update validator mode and start-time 848 st.cfg.ValidatorCfg.Mode = validatorCfg.Mode 849 st.cfg.ValidatorCfg.StartTime = validatorCfg.StartTime 850 st.Unlock() 851 } 852 853 func (st *SubTask) getValidatorStage() pb.Stage { 854 st.RLock() 855 defer 
st.RUnlock() 856 857 if st.validator != nil { 858 return st.validator.Stage() 859 } 860 return pb.Stage_InvalidStage 861 } 862 863 func updateTaskMetric(task, sourceID string, stage pb.Stage, workerName string) { 864 if stage == pb.Stage_Stopped || stage == pb.Stage_Finished { 865 taskState.DeletePartialMatch(prometheus.Labels{"task": task, "source_id": sourceID}) 866 } else { 867 taskState.WithLabelValues(task, sourceID, workerName).Set(float64(stage)) 868 } 869 } 870 871 func (st *SubTask) GetValidatorError(errState pb.ValidateErrorState) ([]*pb.ValidationError, error) { 872 if validator := st.getValidator(); validator != nil { 873 return validator.GetValidatorError(errState) 874 } 875 cfg := st.getCfg() 876 return nil, terror.ErrValidatorNotFound.Generate(cfg.Name, cfg.SourceID) 877 } 878 879 func (st *SubTask) OperateValidatorError(op pb.ValidationErrOp, errID uint64, isAll bool) error { 880 if validator := st.getValidator(); validator != nil { 881 return validator.OperateValidatorError(op, errID, isAll) 882 } 883 cfg := st.getCfg() 884 return terror.ErrValidatorNotFound.Generate(cfg.Name, cfg.SourceID) 885 } 886 887 func (st *SubTask) UpdateValidator(req *pb.UpdateValidationWorkerRequest) error { 888 if validator := st.getValidator(); validator != nil { 889 return validator.UpdateValidator(req) 890 } 891 cfg := st.getCfg() 892 return terror.ErrValidatorNotFound.Generate(cfg.Name, cfg.SourceID) 893 } 894 895 func (st *SubTask) getValidator() *syncer.DataValidator { 896 st.RLock() 897 defer st.RUnlock() 898 return st.validator 899 } 900 901 func (st *SubTask) GetValidatorStatus() (*pb.ValidationStatus, error) { 902 validator := st.getValidator() 903 if validator == nil { 904 cfg := st.getCfg() 905 return nil, terror.ErrValidatorNotFound.Generate(cfg.Name, cfg.SourceID) 906 } 907 return validator.GetValidatorStatus(), nil 908 } 909 910 func (st *SubTask) GetValidatorTableStatus(filterStatus pb.Stage) ([]*pb.ValidationTableStatus, error) { 911 validator := 
st.getValidator() 912 if validator == nil { 913 cfg := st.getCfg() 914 return nil, terror.ErrValidatorNotFound.Generate(cfg.Name, cfg.SourceID) 915 } 916 return validator.GetValidatorTableStatus(filterStatus), nil 917 }