github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/dm/worker/server.go

// Copyright 2019 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// See the License for the specific language governing permissions and
// limitations under the License.

package worker

import (
	"context"
	"fmt"
	"net"
	"sync"
	"time"

	"github.com/pingcap/errors"
	toolutils "github.com/pingcap/tidb-tools/pkg/utils"
	"github.com/pingcap/tiflow/dm/common"
	"github.com/pingcap/tiflow/dm/config"
	"github.com/pingcap/tiflow/dm/pb"
	"github.com/pingcap/tiflow/dm/pkg/binlog"
	tcontext "github.com/pingcap/tiflow/dm/pkg/context"
	"github.com/pingcap/tiflow/dm/pkg/etcdutil"
	"github.com/pingcap/tiflow/dm/pkg/ha"
	"github.com/pingcap/tiflow/dm/pkg/log"
	"github.com/pingcap/tiflow/dm/pkg/terror"
	"github.com/pingcap/tiflow/dm/pkg/utils"
	"github.com/pingcap/tiflow/dm/syncer"
	"github.com/pingcap/tiflow/dm/unit"
	"github.com/soheilhy/cmux"
	clientv3 "go.etcd.io/etcd/client/v3"
	"go.uber.org/atomic"
	"go.uber.org/zap"
	"google.golang.org/grpc"
)

var (
	cmuxReadTimeout           = 10 * time.Second
	dialTimeout               = 3 * time.Second
	keepaliveTimeout          = 3 * time.Second
	keepaliveTime             = 3 * time.Second
	retryGetSourceBoundConfig = 5
	retryGetRelayConfig       = 5
	retryConnectSleepTime     = time.Second
	syncMasterEndpointsTime   = 3 * time.Second
	getMinLocForSubTaskFunc   = getMinLocForSubTask
)

// Server accepts RPC requests
// dispatches requests to worker
// sends responses to RPC client.
type Server struct {
	// closeMu is used to sync Start/Close and protect 5 fields below
	closeMu sync.Mutex
	// closed is used to indicate whether dm-worker server is in closed state.
	closed atomic.Bool
	// calledClose is used to indicate that dm-worker has received signal to close and closed successfully.
	// we use this variable to avoid Start() after Close()
	calledClose bool
	rootLis     net.Listener
	svr         *grpc.Server
	etcdClient  *clientv3.Client
	// end of closeMu

	wg     sync.WaitGroup
	kaWg   sync.WaitGroup
	httpWg sync.WaitGroup
	runWg  sync.WaitGroup

	ctx    context.Context
	cancel context.CancelFunc

	runCtx    context.Context
	runCancel context.CancelFunc

	kaCtx    context.Context
	kaCancel context.CancelFunc

	cfg *Config

	// mu is used to protect worker and sourceStatus. closeMu should be locked first to avoid
	// deadlock when closeMu and mu are both acquired.
	mu     sync.Mutex
	worker *SourceWorker
	// relay status will never be put in server.sourceStatus
	sourceStatus pb.SourceStatus
}

// NewServer creates a new Server.
func NewServer(cfg *Config) *Server {
	s := Server{
		cfg: cfg,
	}
	s.ctx, s.cancel = context.WithCancel(context.Background())
	s.closed.Store(true) // not start yet
	return &s
}

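// Illustrative usage sketch (cfg and signalCh below are assumptions for
// illustration, not names from this file): a caller typically builds the
// server from a parsed configuration, runs Start, and calls Close on shutdown.
//
//	s := NewServer(cfg) // cfg: a previously loaded *Config
//	go func() {
//		<-signalCh // e.g. a channel notified on SIGTERM
//		s.Close()
//	}()
//	if err := s.Start(); err != nil {
//		log.L().Error("dm-worker server exited", zap.Error(err))
//	}
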
// Start starts serving.
// This function should only exit when it can't dial DM-master; for other errors it should not exit.
func (s *Server) Start() error {
	log.L().Info("starting dm-worker server")
	RegistryMetrics()

	var m cmux.CMux

	s.runCtx, s.runCancel = context.WithCancel(s.ctx)

	// protect members from data race. some functions below like GetRelayConfig,
	// GetSourceBoundConfig have a built-in timeout so they will not be stuck for a
	// long time.
	startErr := func() error {
		s.closeMu.Lock()
		defer s.closeMu.Unlock()
		// if dm-worker has received signal and finished close, start() should not continue
		if s.calledClose {
			return terror.ErrWorkerServerClosed
		}

		tls, err := toolutils.NewTLS(s.cfg.SSLCA, s.cfg.SSLCert, s.cfg.SSLKey, s.cfg.AdvertiseAddr, s.cfg.CertAllowedCN)
		if err != nil {
			return terror.ErrWorkerTLSConfigNotValid.Delegate(err)
		}

		rootLis, err := net.Listen("tcp", s.cfg.WorkerAddr)
		if err != nil {
			return terror.ErrWorkerStartService.Delegate(err)
		}
		s.rootLis = tls.WrapListener(rootLis)

		s.etcdClient, err = clientv3.New(clientv3.Config{
			Endpoints:            GetJoinURLs(s.cfg.Join),
			DialTimeout:          dialTimeout,
			DialKeepAliveTime:    keepaliveTime,
			DialKeepAliveTimeout: keepaliveTimeout,
			TLS:                  tls.TLSConfig(),
			AutoSyncInterval:     syncMasterEndpointsTime,
		})
		if err != nil {
			return err
		}

		s.setWorker(nil, true)

		s.runWg.Add(1)
		go func() {
			s.runBackgroundJob(s.runCtx)
			s.runWg.Done()
		}()

		s.startKeepAlive()

		relaySource, revRelay, err := ha.GetRelayConfig(s.etcdClient, s.cfg.Name)
		if err != nil {
			return err
		}
		if relaySource != nil {
			log.L().Warn("worker has been assigned relay before keepalive", zap.String("relay source", relaySource.SourceID))
			if err2 := s.enableRelay(relaySource, true); err2 != nil {
				return err2
			}
		}

		s.runWg.Add(1)
		go func(ctx context.Context) {
			defer s.runWg.Done()
			// TODO: handle fatal error from observeRelayConfig
			//nolint:errcheck
			s.observeRelayConfig(ctx, revRelay)
		}(s.runCtx)

		bound, sourceCfg, revBound, err := ha.GetSourceBoundConfig(s.etcdClient, s.cfg.Name)
		if err != nil {
			return err
		}
		if !bound.IsEmpty() {
			log.L().Warn("worker has been assigned source before keepalive", zap.Stringer("bound", bound), zap.Bool("is deleted", bound.IsDeleted))
			if err2 := s.enableHandleSubtasks(sourceCfg, true); err2 != nil {
				return err2
			}
			log.L().Info("started to handle mysql source", zap.String("sourceCfg", sourceCfg.String()))
		}

		s.runWg.Add(1)
		go func(ctx context.Context) {
			defer s.runWg.Done()
			for {
				err1 := s.observeSourceBound(ctx, revBound)
				if err1 == nil {
					return
				}
				s.restartKeepAlive()
			}
		}(s.runCtx)

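		// The goroutines started above (runBackgroundJob, the relay-config
		// observer and the source-bound observer) are all tracked by runWg and
		// share runCtx, so doClose can cancel runCtx and then wait on runWg to
		// stop them together.
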
		// create a cmux
		m = cmux.New(s.rootLis)

		m.SetReadTimeout(cmuxReadTimeout) // set a timeout, ref: https://github.com/pingcap/tidb-binlog/pull/352

		// match connections in order: first gRPC, then HTTP
		grpcL := m.MatchWithWriters(cmux.HTTP2MatchHeaderFieldSendSettings("content-type", "application/grpc"))

		httpL := m.Match(cmux.HTTP1Fast())

		// NOTE: don't need to set tls config, because rootLis already use tls
		s.svr = grpc.NewServer()
		pb.RegisterWorkerServer(s.svr, s)

		grpcExitCh := make(chan struct{}, 1)
		s.wg.Add(1)
		go func() {
			err2 := s.svr.Serve(grpcL)
			if err2 != nil && !common.IsErrNetClosing(err2) && err2 != cmux.ErrListenerClosed {
				log.L().Error("gRPC server returned", log.ShortError(err2))
			}
			grpcExitCh <- struct{}{}
		}()
		go func(ctx context.Context) {
			defer s.wg.Done()
			select {
			case <-ctx.Done():
				if s.svr != nil {
					// GracefulStop can not cancel active stream RPCs
					// and the stream RPC may block on Recv or Send
					// so we use Stop instead to cancel all active RPCs
					s.svr.Stop()
				}
			case <-grpcExitCh:
			}
		}(s.ctx)

		s.httpWg.Add(1)
		go func() {
			defer s.httpWg.Done()
			InitStatus(httpL) // serve status
		}()

		s.closed.Store(false) // the server started now.
		return nil
	}()

	if startErr != nil {
		return startErr
	}

	log.L().Info("listening gRPC API and status request", zap.String("address", s.cfg.WorkerAddr))

	err := m.Serve()
	if err != nil && common.IsErrNetClosing(err) {
		err = nil
	}
	return terror.ErrWorkerStartService.Delegate(err)
}

// worker keepalive with master.
// If the worker loses its connection to master, it stops all tasks and tries to connect to master again.
func (s *Server) startKeepAlive() {
	s.kaWg.Add(1)
	s.kaCtx, s.kaCancel = context.WithCancel(s.ctx)
	go s.doStartKeepAlive()
}

func (s *Server) doStartKeepAlive() {
	defer s.kaWg.Done()
	s.KeepAlive()
}

func (s *Server) stopKeepAlive() {
	if s.kaCancel != nil {
		s.kaCancel()
		s.kaWg.Wait()
	}
}

func (s *Server) restartKeepAlive() {
	s.stopKeepAlive()
	s.startKeepAlive()
}

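// observeRelayConfig and observeSourceBound below share the same structure:
// watch the relevant etcd key starting from the given revision, and when the
// watch fails with a retryable etcd error (such as a compaction) re-read the
// full config with a GET to obtain a fresh revision and resume watching from
// it; a non-retryable error makes the observer return that error.
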
func (s *Server) observeRelayConfig(ctx context.Context, rev int64) error {
	var wg sync.WaitGroup
	for {
		relayCh := make(chan ha.RelaySource, 10)
		relayErrCh := make(chan error, 10)
		wg.Add(1)
		// use ctx1, cancel1 to make sure old watcher has been released
		ctx1, cancel1 := context.WithCancel(ctx)
		go func() {
			defer func() {
				close(relayCh)
				close(relayErrCh)
				wg.Done()
			}()
			ha.WatchRelayConfig(ctx1, s.etcdClient, s.cfg.Name, rev+1, relayCh, relayErrCh)
		}()
		err := s.handleRelayConfig(ctx1, relayCh, relayErrCh)
		cancel1()
		wg.Wait()

		if etcdutil.IsRetryableError(err) {
			rev = 0
			retryNum := 1
			for rev == 0 {
				select {
				case <-ctx.Done():
					return nil
				case <-time.After(500 * time.Millisecond):
					relaySource, rev1, err1 := ha.GetRelayConfig(s.etcdClient, s.cfg.Name)
					if err1 != nil {
						log.L().Error("get relay config from etcd failed, will retry later", zap.Error(err1), zap.Int("retryNum", retryNum))
						retryNum++
						if retryNum > retryGetRelayConfig && etcdutil.IsLimitedRetryableError(err1) {
							return err1
						}
						break
					}
					rev = rev1
					if relaySource == nil {
						if w := s.getSourceWorker(true); w != nil && w.startedRelayBySourceCfg {
							break
						}
						log.L().Info("didn't found relay config after etcd retryable error. Will stop relay now")
						err = s.disableRelay("")
						if err != nil {
							log.L().Error("fail to disableRelay after etcd retryable error", zap.Error(err))
							return err // return if failed to stop the worker.
						}
					} else {
						err2 := func() error {
							s.mu.Lock()
							defer s.mu.Unlock()

							if w := s.getSourceWorker(false); w != nil && w.cfg.SourceID == relaySource.SourceID {
								// we may face both relay config and subtask bound changed in a compaction error, so here
								// we check if observeSourceBound has started a worker
								// TODO: add a test for this situation
								if !w.relayEnabled.Load() {
									if err2 := w.EnableRelay(false); err2 != nil {
										return err2
									}
								}
								return nil
							}
							err = s.stopSourceWorker("", false, true)
							if err != nil {
								log.L().Error("fail to stop worker", zap.Error(err))
								return err // return if failed to stop the worker.
							}
							log.L().Info("will recover observeRelayConfig",
								zap.String("relay source", relaySource.SourceID))
							return s.enableRelay(relaySource, false)
						}()
						if err2 != nil {
							return err2
						}
					}
				}
			}
		} else {
			if err != nil {
				log.L().Error("observeRelayConfig is failed and will quit now", zap.Error(err))
			} else {
				log.L().Info("observeRelayConfig will quit now")
			}
			return err
		}
	}
}

// observeSourceBound will
// 1. keep bound relation updated from DM-master
// 2. keep enable-relay in source config updated. (TODO) This relies on DM-master re-put SourceBound after change it.
func (s *Server) observeSourceBound(ctx context.Context, rev int64) error {
	var wg sync.WaitGroup
	for {
		sourceBoundCh := make(chan ha.SourceBound, 10)
		sourceBoundErrCh := make(chan error, 10)
		wg.Add(1)
		// use ctx1, cancel1 to make sure old watcher has been released
		ctx1, cancel1 := context.WithCancel(ctx)
		go func() {
			defer func() {
				close(sourceBoundCh)
				close(sourceBoundErrCh)
				wg.Done()
			}()
			ha.WatchSourceBound(ctx1, s.etcdClient, s.cfg.Name, rev+1, sourceBoundCh, sourceBoundErrCh)
		}()
		err := s.handleSourceBound(ctx1, sourceBoundCh, sourceBoundErrCh)
		cancel1()
		wg.Wait()

		if etcdutil.IsRetryableError(err) {
			rev = 0
			retryNum := 1
			for rev == 0 {
				select {
				case <-ctx.Done():
					return nil
				case <-time.After(500 * time.Millisecond):
					bound, cfg, rev1, err1 := ha.GetSourceBoundConfig(s.etcdClient, s.cfg.Name)
					if err1 != nil {
						log.L().Error("get source bound from etcd failed, will retry later", zap.Error(err1), zap.Int("retryNum", retryNum))
						retryNum++
						if retryNum > retryGetSourceBoundConfig && etcdutil.IsLimitedRetryableError(err1) {
							return err1
						}
						break
					}
					rev = rev1
					if bound.IsEmpty() {
						err = s.disableHandleSubtasks("")
						if err != nil {
							log.L().Error("fail to disableHandleSubtasks after etcd retryable error", zap.Error(err))
							return err // return if failed to stop the worker.
						}
					} else {
						err2 := func() error {
							s.mu.Lock()
							defer s.mu.Unlock()

							if w := s.getSourceWorker(false); w != nil && w.cfg.SourceID == bound.Source {
								// we may face both relay config and subtask bound changed in a compaction error, so here
								// we check if observeRelayConfig has started a worker
								// TODO: add a test for this situation
								if !w.subTaskEnabled.Load() {
									if err2 := w.EnableHandleSubtasks(); err2 != nil {
										return err2
									}
								}
								return nil
							}
							err = s.stopSourceWorker("", false, true)
							if err != nil {
								log.L().Error("fail to stop worker", zap.Error(err))
								return err // return if failed to stop the worker.
							}
							log.L().Info("will recover observeSourceBound",
								zap.String("relay source", cfg.SourceID))
							return s.enableHandleSubtasks(cfg, false)
						}()
						if err2 != nil {
							if terror.ErrWorkerServerClosed.Equal(err2) {
								// return nil to exit the loop in caller
								return nil
							}
							return err2
						}
					}
				}
			}
		} else {
			if err != nil {
				log.L().Error("observeSourceBound is failed and will quit now", zap.Error(err))
			} else {
				log.L().Info("observeSourceBound will quit now")
			}
			return err
		}
	}
}

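// doClose stops components roughly in the reverse order they were started in
// Start: cancel runCtx and wait for the background job and the two observers,
// stop the source worker, then close the root listener and wait for the HTTP
// status goroutine to return.
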
func (s *Server) doClose() {
	if s.closed.Load() {
		return
	}
	// stop server in advance, stop receiving source bound and relay bound
	s.runCancel()
	s.runWg.Wait()

	// stop worker and wait for return (we already lock the whole Server, so no need to use lock to get source worker)
	if w := s.getSourceWorker(true); w != nil {
		w.Stop(true)
	}

	// close listener at last, so we can get status from it if worker failed to close in previous step
	if s.rootLis != nil {
		err2 := s.rootLis.Close()
		if err2 != nil && !common.IsErrNetClosing(err2) {
			log.L().Error("fail to close net listener", log.ShortError(err2))
		}
	}
	s.httpWg.Wait()

	s.closed.Store(true)
}

// Close closes the RPC server, this function can be called multiple times.
func (s *Server) Close() {
	s.closeMu.Lock()
	defer s.closeMu.Unlock()
	s.doClose() // we should stop current sync first, otherwise master may schedule task on new worker while we are closing
	s.stopKeepAlive()

	s.cancel()
	s.wg.Wait()

	if s.etcdClient != nil {
		s.etcdClient.Close()
	}
	s.calledClose = true
}

// if needLock is false, we should make sure Server has been locked in caller.
func (s *Server) getSourceWorker(needLock bool) *SourceWorker {
	if needLock {
		s.mu.Lock()
		defer s.mu.Unlock()
	}
	return s.worker
}

// if needLock is false, we should make sure Server has been locked in caller.
func (s *Server) setWorker(worker *SourceWorker, needLock bool) {
	if needLock {
		s.mu.Lock()
		defer s.mu.Unlock()
	}
	s.worker = worker
}

// nolint:unparam
func (s *Server) getSourceStatus(needLock bool) pb.SourceStatus {
	if needLock {
		s.mu.Lock()
		defer s.mu.Unlock()
	}
	return s.sourceStatus
}

// TODO: move some call to setWorker/getOrStartWorker.
func (s *Server) setSourceStatus(source string, err error, needLock bool) {
	if needLock {
		s.mu.Lock()
		defer s.mu.Unlock()
	}
	// now setSourceStatus will be concurrently called. skip setting a source status if worker has been closed
	if s.getSourceWorker(false) == nil && source != "" {
		return
	}
	s.sourceStatus = pb.SourceStatus{
		Source: source,
		Worker: s.cfg.Name,
	}
	if err != nil {
		s.sourceStatus.Result = &pb.ProcessResult{
			Errors: []*pb.ProcessError{
				unit.NewProcessError(err),
			},
		}
	}
}

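// getSourceWorker, setWorker, getSourceStatus and setSourceStatus above, and
// stopSourceWorker and getOrStartWorker below, all take a needLock parameter:
// callers that do not hold s.mu pass true so the helper locks it internally,
// while callers that already hold s.mu for a longer sequence (for example
// enableHandleSubtasks and enableRelay) pass false to avoid locking the same
// mutex twice.
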
// if sourceID is set to "", worker will be closed directly
// if sourceID is not "", we will check sourceID with w.cfg.SourceID.
func (s *Server) stopSourceWorker(sourceID string, needLock, graceful bool) error {
	if needLock {
		s.mu.Lock()
		defer s.mu.Unlock()
	}
	w := s.getSourceWorker(false)
	if w == nil {
		log.L().Warn("worker has not been started, no need to stop", zap.String("source", sourceID))
		return nil // no need to stop because not started yet
	}
	if sourceID != "" && w.cfg.SourceID != sourceID {
		return terror.ErrWorkerSourceNotMatch
	}
	s.UpdateKeepAliveTTL(s.cfg.KeepAliveTTL)
	s.setWorker(nil, false)
	s.setSourceStatus("", nil, false)
	w.Stop(graceful)
	return nil
}

func (s *Server) handleSourceBound(ctx context.Context, boundCh chan ha.SourceBound, errCh chan error) error {
OUTER:
	for {
		select {
		case <-ctx.Done():
			break OUTER
		case bound, ok := <-boundCh:
			if !ok {
				break OUTER
			}
			log.L().Info("receive source bound", zap.Stringer("bound", bound), zap.Bool("is deleted", bound.IsDeleted))
			err := s.operateSourceBound(bound)
			s.setSourceStatus(bound.Source, err, true)
			if err != nil {
				opErrCounter.WithLabelValues(s.cfg.Name, opErrTypeSourceBound).Inc()
				log.L().Error("fail to operate sourceBound on worker", zap.Stringer("bound", bound), zap.Bool("is deleted", bound.IsDeleted), zap.Error(err))
				if etcdutil.IsRetryableError(err) {
					return err
				}
			}
		case err, ok := <-errCh:
			if !ok {
				break OUTER
			}
			// TODO: Deal with err
			log.L().Error("WatchSourceBound received an error", zap.Error(err))
			if etcdutil.IsRetryableError(err) {
				return err
			}
		}
	}
	log.L().Info("handleSourceBound will quit now")
	return nil
}

func (s *Server) handleRelayConfig(ctx context.Context, relayCh chan ha.RelaySource, errCh chan error) error {
OUTER:
	for {
		select {
		case <-ctx.Done():
			break OUTER
		case relaySource, ok := <-relayCh:
			if !ok {
				break OUTER
			}
			log.L().Info("receive relay source", zap.String("relay source", relaySource.Source), zap.Bool("is deleted", relaySource.IsDeleted))
			err := s.operateRelaySource(relaySource)
			s.setSourceStatus(relaySource.Source, err, true)
			if err != nil {
				opErrCounter.WithLabelValues(s.cfg.Name, opErrTypeRelaySource).Inc()
				log.L().Error("fail to operate relay source on worker",
					zap.String("relay source", relaySource.Source),
					zap.Bool("is deleted", relaySource.IsDeleted),
					zap.Error(err))
				if etcdutil.IsRetryableError(err) {
					return err
				}
			}
		case err, ok := <-errCh:
			// currently no value is sent to errCh
			if !ok {
				break OUTER
			}
			// TODO: Deal with err
			log.L().Error("WatchRelayConfig received an error", zap.Error(err))
			if etcdutil.IsRetryableError(err) {
				return err
			}
		}
	}
	log.L().Info("worker server is closed, handleRelayConfig will quit now")
	return nil
}

func (s *Server) operateSourceBound(bound ha.SourceBound) error {
	if bound.IsDeleted {
		return s.disableHandleSubtasks(bound.Source)
	}
	scm, _, err := ha.GetSourceCfg(s.etcdClient, bound.Source, bound.Revision)
	if err != nil {
		// TODO: need retry
		return err
	}
	sourceCfg, ok := scm[bound.Source]
	if !ok {
		return terror.ErrWorkerFailToGetSourceConfigFromEtcd.Generate(bound.Source)
	}
	return s.enableHandleSubtasks(sourceCfg, true)
}

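// Relay can be enabled through two paths: by the `enable-relay` item in the
// source config (handled in enableHandleSubtasks below, which calls
// EnableRelay(true) and marks startedRelayBySourceCfg), or by a relay config
// written to etcd when the user runs `start-relay` (handled by
// operateRelaySource/enableRelay, which call EnableRelay(false)).
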
func (s *Server) enableHandleSubtasks(sourceCfg *config.SourceConfig, needLock bool) error {
	if needLock {
		s.mu.Lock()
		defer s.mu.Unlock()
	}

	w, err := s.getOrStartWorker(sourceCfg, false)
	s.setSourceStatus(sourceCfg.SourceID, err, false)
	if err != nil {
		return err
	}

	if sourceCfg.EnableRelay {
		log.L().Info("will start relay by `enable-relay` in source config")
		if err2 := w.EnableRelay(true); err2 != nil {
			log.L().Error("found a `enable-relay: true` source, but failed to enable relay for DM worker",
				zap.Error(err2))
			return err2
		}
	} else if w.startedRelayBySourceCfg {
		log.L().Info("will disable relay by `enable-relay: false` in source config")
		w.DisableRelay()
	}

	if err2 := w.EnableHandleSubtasks(); err2 != nil {
		s.setSourceStatus(sourceCfg.SourceID, err2, false)
		return err2
	}
	return nil
}

func (s *Server) disableHandleSubtasks(source string) error {
	s.mu.Lock()
	defer s.mu.Unlock()
	w := s.getSourceWorker(false)
	if w == nil {
		log.L().Warn("worker has already stopped before DisableHandleSubtasks", zap.String("source", source))
		return nil
	}

	w.DisableHandleSubtasks()

	// now the worker is unbound, stop relay if it's started by source config
	if w.cfg.EnableRelay && w.startedRelayBySourceCfg {
		log.L().Info("stop relay because the source is unbound")
		w.DisableRelay()
	}

	var err error
	if !w.relayEnabled.Load() {
		log.L().Info("relay is not enabled after disabling subtask, so stop worker")
		err = s.stopSourceWorker(source, false, true)
	}
	return err
}

func (s *Server) operateRelaySource(relaySource ha.RelaySource) error {
	if relaySource.IsDeleted {
		return s.disableRelay(relaySource.Source)
	}
	scm, _, err := ha.GetSourceCfg(s.etcdClient, relaySource.Source, relaySource.Revision)
	if err != nil {
		// TODO: need retry
		return err
	}
	sourceCfg, ok := scm[relaySource.Source]
	if !ok {
		return terror.ErrWorkerFailToGetSourceConfigFromEtcd.Generate(relaySource.Source)
	}
	return s.enableRelay(sourceCfg, true)
}

func (s *Server) enableRelay(sourceCfg *config.SourceConfig, needLock bool) error {
	if needLock {
		s.mu.Lock()
		defer s.mu.Unlock()
	}

	w, err2 := s.getOrStartWorker(sourceCfg, false)
	s.setSourceStatus(sourceCfg.SourceID, err2, false)
	if err2 != nil {
		// if DM-worker can't handle pre-assigned source before keepalive, it simply exits with the error,
		// because no re-assigned mechanism exists for keepalived DM-worker yet.
		return err2
	}
	if err2 = w.EnableRelay(false); err2 != nil {
		s.setSourceStatus(sourceCfg.SourceID, err2, false)
		return err2
	}
	s.UpdateKeepAliveTTL(s.cfg.RelayKeepAliveTTL)
	return nil
}

func (s *Server) disableRelay(source string) error {
	s.mu.Lock()
	defer s.mu.Unlock()
	w := s.getSourceWorker(false)
	if w == nil {
		log.L().Warn("worker has already stopped before DisableRelay", zap.Any("relaySource", source))
		return nil
	}
	s.UpdateKeepAliveTTL(s.cfg.KeepAliveTTL)
	w.DisableRelay()
	var err error
	if !w.subTaskEnabled.Load() {
		log.L().Info("subtask is not enabled after disabling relay, so stop worker")
		err = s.stopSourceWorker(source, false, true)
	}
	return err
}

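// The methods below implement the pb.WorkerServer gRPC service registered in
// Start. Illustrative client-side sketch (conn and taskName are assumed names,
// not taken from this file): DM-master reaches these handlers through the
// generated client, e.g.
//
//	cli := pb.NewWorkerClient(conn) // conn: an established *grpc.ClientConn to this worker
//	resp, err := cli.QueryStatus(ctx, &pb.QueryStatusRequest{Name: taskName})
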
// QueryStatus implements WorkerServer.QueryStatus.
func (s *Server) QueryStatus(ctx context.Context, req *pb.QueryStatusRequest) (*pb.QueryStatusResponse, error) {
	log.L().Info("", zap.String("request", "QueryStatus"), zap.Stringer("payload", req))

	sourceStatus := s.getSourceStatus(true)
	sourceStatus.Worker = s.cfg.Name
	resp := &pb.QueryStatusResponse{
		Result:       true,
		SourceStatus: &sourceStatus,
	}

	w := s.getSourceWorker(true)
	if w == nil {
		log.L().Warn("fail to call QueryStatus, because no mysql source is being handled in the worker")
		resp.Result = false
		resp.Msg = terror.ErrWorkerNoStart.Error()
		return resp, nil
	}

	var err error
	resp.SubTaskStatus, sourceStatus.RelayStatus, err = w.QueryStatus(ctx, req.Name)

	if err != nil {
		resp.Msg = fmt.Sprintf("error when get master status: %v", err)
	} else if len(resp.SubTaskStatus) == 0 {
		resp.Msg = "no sub task started"
	}
	return resp, nil
}

// PurgeRelay implements WorkerServer.PurgeRelay.
func (s *Server) PurgeRelay(ctx context.Context, req *pb.PurgeRelayRequest) (*pb.CommonWorkerResponse, error) {
	log.L().Info("", zap.String("request", "PurgeRelay"), zap.Stringer("payload", req))
	w := s.getSourceWorker(true)
	if w == nil {
		log.L().Warn("fail to call StartSubTask, because no mysql source is being handled in the worker")
		return makeCommonWorkerResponse(terror.ErrWorkerNoStart.Generate()), nil
	}

	err := w.PurgeRelay(ctx, req)
	if err != nil {
		log.L().Error("fail to purge relay", zap.String("request", "PurgeRelay"), zap.Stringer("payload", req), zap.Error(err))
	}
	return makeCommonWorkerResponse(err), nil
}

// OperateSchema operates schema for an upstream table.
func (s *Server) OperateSchema(ctx context.Context, req *pb.OperateWorkerSchemaRequest) (*pb.CommonWorkerResponse, error) {
	log.L().Info("", zap.String("request", "OperateSchema"), zap.Stringer("payload", req))

	w := s.getSourceWorker(true)
	if w == nil {
		log.L().Warn("fail to call OperateSchema, because no mysql source is being handled in the worker")
		return makeCommonWorkerResponse(terror.ErrWorkerNoStart.Generate()), nil
	}
	w.RLock()
	sourceID := w.cfg.SourceID
	w.RUnlock()
	if req.Source != sourceID {
		log.L().Error("fail to call OperateSchema, because source mismatch", zap.String("request", req.Source), zap.String("current", sourceID))
		return makeCommonWorkerResponse(terror.ErrWorkerSourceNotMatch.Generate()), nil
	}

	schema, err := w.OperateSchema(ctx, req)
	if err != nil {
		return makeCommonWorkerResponse(err), nil
	}
	return &pb.CommonWorkerResponse{
		Result: true,
		Msg:    schema, // if any schema return for `GET`, we place it in the `msg` field now.
		Source: req.Source,
		Worker: s.cfg.Name,
	}, nil
}

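// getOrStartWorker below reuses the running SourceWorker when the requested
// source matches, and otherwise starts a new one. Note that it waits up to
// roughly 5 seconds (50 checks at 100ms intervals) for the new worker to leave
// the closed state before giving up with ErrWorkerNoStart.
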
func (s *Server) getOrStartWorker(cfg *config.SourceConfig, needLock bool) (*SourceWorker, error) {
	if needLock {
		s.mu.Lock()
		defer s.mu.Unlock()
	}

	if w := s.getSourceWorker(false); w != nil {
		if w.cfg.SourceID == cfg.SourceID {
			log.L().Info("mysql source is being handled", zap.String("sourceID", s.worker.cfg.SourceID))
			return w, nil
		}
		return nil, terror.ErrWorkerAlreadyStart.Generate(w.name, w.cfg.SourceID, cfg.SourceID)
	}

	log.L().Info("will start a new worker", zap.String("sourceID", cfg.SourceID))
	w, err := NewSourceWorker(cfg, s.etcdClient, s.cfg.Name, s.cfg.RelayDir)
	if err != nil {
		return nil, err
	}
	s.setWorker(w, false)

	go w.Start()

	isStarted := utils.WaitSomething(50, 100*time.Millisecond, func() bool {
		return !w.closed.Load()
	})
	if !isStarted {
		// TODO: add more mechanism to wait or un-bound the source
		return nil, terror.ErrWorkerNoStart
	}
	return w, nil
}

func makeCommonWorkerResponse(reqErr error) *pb.CommonWorkerResponse {
	resp := &pb.CommonWorkerResponse{
		Result: true,
	}
	if reqErr != nil {
		resp.Result = false
		resp.Msg = reqErr.Error()
	}
	return resp
}

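// Note on the min-location helpers below: binlog.CompareLocation returns a
// positive value when the first location is ahead of the second, so
// getMinLocInAllSubTasks keeps the earliest checkpoint among all subtasks of a
// source, which is where relay must start from to cover every subtask. For
// example (illustrative), given checkpoints at (mysql-bin.000003, 4) and
// (mysql-bin.000001, 4), the latter is returned.
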
// all subTasks in subTaskCfgs should have the same source.
// this function returns the min location among all subtasks, used for relay's location.
func getMinLocInAllSubTasks(ctx context.Context, subTaskCfgs map[string]config.SubTaskConfig) (minLoc *binlog.Location, err error) {
	for _, subTaskCfg := range subTaskCfgs {
		loc, err := getMinLocForSubTaskFunc(ctx, subTaskCfg)
		if err != nil {
			return nil, err
		}

		if loc == nil {
			continue
		}

		if minLoc == nil {
			minLoc = loc
		} else if binlog.CompareLocation(*minLoc, *loc, subTaskCfg.EnableGTID) >= 1 {
			minLoc = loc
		}
	}

	return minLoc, nil
}

func getMinLocForSubTask(ctx context.Context, subTaskCfg config.SubTaskConfig) (minLoc *binlog.Location, err error) {
	if !config.HasSync(subTaskCfg.Mode) {
		return nil, nil
	}
	subTaskCfg2, err := subTaskCfg.DecryptedClone()
	if err != nil {
		return nil, errors.Annotate(err, "get min position from checkpoint")
	}

	tctx := tcontext.NewContext(ctx, log.L())
	checkpoint := syncer.NewRemoteCheckPoint(tctx, subTaskCfg2, nil, subTaskCfg2.SourceID)
	err = checkpoint.Init(tctx)
	if err != nil {
		return nil, errors.Annotate(err, "get min position from checkpoint")
	}
	defer checkpoint.Close()

	err = checkpoint.Load(tctx)
	if err != nil {
		return nil, errors.Annotate(err, "get min position from checkpoint")
	}

	location := checkpoint.GlobalPoint()
	return &location, nil
}

// HandleError handles errors.
func (s *Server) HandleError(ctx context.Context, req *pb.HandleWorkerErrorRequest) (*pb.CommonWorkerResponse, error) {
	log.L().Info("", zap.String("request", "HandleError"), zap.Stringer("payload", req))

	w := s.getSourceWorker(true)
	if w == nil {
		log.L().Warn("fail to call HandleError, because no mysql source is being handled in the worker")
		return makeCommonWorkerResponse(terror.ErrWorkerNoStart.Generate()), nil
	}

	msg, err := w.HandleError(ctx, req)
	if err != nil {
		return makeCommonWorkerResponse(err), nil
	}
	return &pb.CommonWorkerResponse{
		Result: true,
		Worker: s.cfg.Name,
		Msg:    msg,
	}, nil
}

// GetWorkerCfg gets the worker config.
func (s *Server) GetWorkerCfg(ctx context.Context, req *pb.GetWorkerCfgRequest) (*pb.GetWorkerCfgResponse, error) {
	log.L().Info("", zap.String("request", "GetWorkerCfg"), zap.Stringer("payload", req))
	var err error
	resp := &pb.GetWorkerCfgResponse{}

	resp.Cfg, err = s.cfg.Toml()
	return resp, err
}

// CheckSubtasksCanUpdate checks if the input subtask cfg can be updated.
func (s *Server) CheckSubtasksCanUpdate(ctx context.Context, req *pb.CheckSubtasksCanUpdateRequest) (*pb.CheckSubtasksCanUpdateResponse, error) {
	log.L().Info("", zap.String("request", "CheckSubtasksCanUpdate"), zap.Stringer("payload", req))
	resp := &pb.CheckSubtasksCanUpdateResponse{}
	defer func() {
		log.L().Info("", zap.String("request", "CheckSubtasksCanUpdate"), zap.Stringer("resp", resp))
	}()
	w := s.getSourceWorker(true)
	if w == nil {
		msg := "fail to call CheckSubtasksCanUpdate, because no mysql source is being handled in the worker"
		log.L().Warn(msg)
		resp.Msg = msg
		return resp, nil
	}
	cfg := config.NewSubTaskConfig()
	if err := cfg.Decode(req.SubtaskCfgTomlString, false); err != nil {
		resp.Msg = err.Error()
		// nolint:nilerr
		return resp, nil
	}
	if err := w.CheckCfgCanUpdated(cfg); err != nil {
		resp.Msg = err.Error()
		// nolint:nilerr
		return resp, nil
	}
	resp.Success = true
	return resp, nil
}

func (s *Server) GetWorkerValidatorStatus(ctx context.Context, req *pb.GetValidationStatusRequest) (*pb.GetValidationStatusResponse, error) {
	log.L().Info("", zap.String("request", "GetWorkerValidateStatus"), zap.Stringer("payload", req))

	resp := &pb.GetValidationStatusResponse{
		Result: true,
	}
	w := s.getSourceWorker(true)
	if w == nil {
		log.L().Warn("fail to call GetWorkerValidateStatus, because no mysql source is being handled in the worker")
		resp.Result = false
		resp.Msg = terror.ErrWorkerNoStart.Error()
		return resp, nil
	}
	validatorStatus, err := w.GetValidatorStatus(req.TaskName)
	if err != nil {
		return resp, err
	}
	res, err := w.GetValidatorTableStatus(req.TaskName, req.FilterStatus)
	if err != nil {
		return resp, err
	}

	resp.Validators = []*pb.ValidationStatus{validatorStatus}
	resp.TableStatuses = res
	return resp, nil
}

func (s *Server) GetValidatorError(ctx context.Context, req *pb.GetValidationErrorRequest) (*pb.GetValidationErrorResponse, error) {
	w := s.getSourceWorker(true)
	resp := &pb.GetValidationErrorResponse{
		Result: true,
	}
	if w == nil {
		log.L().Warn("fail to get validator error, because no mysql source is being handled in the worker")
		resp.Result = false
		resp.Msg = terror.ErrWorkerNoStart.Error()
		return resp, nil
	}
	validatorErrs, err := w.GetWorkerValidatorErr(req.TaskName, req.ErrState)
	if err != nil {
		resp.Msg = err.Error()
		resp.Result = false
	} else {
		resp.Error = validatorErrs
	}
	return resp, nil
}

func (s *Server) OperateValidatorError(ctx context.Context, req *pb.OperateValidationErrorRequest) (*pb.OperateValidationErrorResponse, error) {
	log.L().Info("operate validation error", zap.Stringer("payload", req))
	w := s.getSourceWorker(true)
	resp := &pb.OperateValidationErrorResponse{
		Result: true,
	}
	if w == nil {
		log.L().Warn("fail to operate validator error, because no mysql source is being handled in the worker")
		resp.Result = false
		resp.Msg = terror.ErrWorkerNoStart.Error()
		return resp, nil
	}
	err := w.OperateWorkerValidatorErr(req.TaskName, req.Op, req.ErrId, req.IsAllError)
	if err != nil {
		resp.Result = false
		resp.Msg = err.Error()
		//nolint:nilerr
		return resp, nil
	}
	//nolint:nilerr
	return resp, nil
}

func (s *Server) UpdateValidator(ctx context.Context, req *pb.UpdateValidationWorkerRequest) (*pb.CommonWorkerResponse, error) {
	log.L().Info("update validation", zap.Stringer("payload", req))
	w := s.getSourceWorker(true)
	resp := &pb.CommonWorkerResponse{
		Result: true,
	}
	if w == nil {
		log.L().Warn("fail to update validator, because no mysql source is being handled in the worker")
		resp.Result = false
		resp.Msg = terror.ErrWorkerNoStart.Error()
		return resp, nil
	}
	err := w.UpdateWorkerValidator(req)
	if err != nil {
		resp.Result = false
		resp.Msg = err.Error()
		//nolint:nilerr
		return resp, nil
	}
	resp.Source = w.cfg.SourceID
	resp.Worker = s.cfg.Name
	//nolint:nilerr
	return resp, nil
}