github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/engine/executor/server.go

// Copyright 2022 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// See the License for the specific language governing permissions and
// limitations under the License.

package executor

import (
	"context"
	"net/http"
	"net/http/pprof"
	"strings"
	"time"

	grpcprometheus "github.com/grpc-ecosystem/go-grpc-prometheus"
	"github.com/pingcap/log"
	"github.com/pingcap/tidb/pkg/util/gctuner"
	"github.com/pingcap/tidb/pkg/util/memory"
	"github.com/pingcap/tiflow/dm/common"
	pb "github.com/pingcap/tiflow/engine/enginepb"
	"github.com/pingcap/tiflow/engine/executor/server"
	"github.com/pingcap/tiflow/engine/executor/worker"
	"github.com/pingcap/tiflow/engine/framework"
	frameLog "github.com/pingcap/tiflow/engine/framework/logutil"
	frameModel "github.com/pingcap/tiflow/engine/framework/model"
	"github.com/pingcap/tiflow/engine/framework/registry"
	"github.com/pingcap/tiflow/engine/framework/taskutil"
	"github.com/pingcap/tiflow/engine/internal/pkg/discovery"
	"github.com/pingcap/tiflow/engine/model"
	pkgClient "github.com/pingcap/tiflow/engine/pkg/client"
	dcontext "github.com/pingcap/tiflow/engine/pkg/context"
	"github.com/pingcap/tiflow/engine/pkg/deps"
	"github.com/pingcap/tiflow/engine/pkg/externalresource/broker"
	metaModel "github.com/pingcap/tiflow/engine/pkg/meta/model"
	"github.com/pingcap/tiflow/engine/pkg/openapi"
	pkgOrm "github.com/pingcap/tiflow/engine/pkg/orm"
	"github.com/pingcap/tiflow/engine/pkg/p2p"
	"github.com/pingcap/tiflow/engine/pkg/promutil"
	"github.com/pingcap/tiflow/engine/pkg/rpcutil"
	"github.com/pingcap/tiflow/engine/pkg/tenant"
	"github.com/pingcap/tiflow/engine/test/mock"
	"github.com/pingcap/tiflow/pkg/errors"
	"github.com/pingcap/tiflow/pkg/errorutil"
	"github.com/pingcap/tiflow/pkg/logutil"
	p2pImpl "github.com/pingcap/tiflow/pkg/p2p"
	"github.com/pingcap/tiflow/pkg/security"
	"github.com/pingcap/tiflow/pkg/tcpserver"
	"go.uber.org/dig"
	"go.uber.org/zap"
	"golang.org/x/sync/errgroup"
	"golang.org/x/time/rate"
	"google.golang.org/grpc"
	"google.golang.org/grpc/codes"
	"google.golang.org/grpc/status"
)

const (
	// TODO since we introduced queuing in the TaskRunner, it is no longer
	// easy to implement the capacity. Think of a better solution later.
	// defaultRuntimeCapacity = 65536
	defaultRuntimeIncomingQueueLen   = 256
	defaultRuntimeInitConcurrency    = 256
	defaultTaskPreDispatchRequestTTL = 10 * time.Second
	defaultDiscoveryAutoSyncInterval = 5 * time.Second
)

// Server is an executor server.
type Server struct {
	cfg *Config

	tcpServer     tcpserver.TCPServer
	grpcSrv       *grpc.Server
	masterClient  pkgClient.ServerMasterClient
	executorGroup *pkgClient.DefaultExecutorGroup
	taskRunner    *worker.TaskRunner
	taskCommitter *worker.TaskCommitter
	msgServer     *p2p.MessageRPCService
	selfID        model.ExecutorID

	lastHearbeatTime time.Time

	mockSrv mock.GrpcServer

	metastores server.MetastoreManager

	p2pMsgRouter   p2pImpl.MessageRouter
	resourceBroker broker.Broker
	jobAPISrv      *jobAPIServer
}

// NewServer creates a new executor server instance
func NewServer(cfg *Config) *Server {
	log.Info("creating executor", zap.Stringer("config", cfg))

	registerWorkerOnce.Do(registerWorkers)
	s := Server{
		cfg:        cfg,
		jobAPISrv:  newJobAPIServer(),
		metastores: server.NewMetastoreManager(),
	}
	return &s
}

func (s *Server) buildDeps() (*deps.Deps, error) {
	deps := deps.NewDeps()
	err := deps.Provide(func() p2p.MessageHandlerManager {
		return s.msgServer.MakeHandlerManager()
	})
	if err != nil {
		return nil, err
	}

	err = deps.Provide(func() p2p.MessageSender {
		return p2p.NewMessageSender(s.p2pMsgRouter)
	})
	if err != nil {
		return nil, err
	}

	cli, err := pkgOrm.NewClient(s.metastores.FrameworkClientConn())
	if err != nil {
		return nil, err
	}
	err = deps.Provide(func() pkgOrm.Client {
		return cli
	})
	if err != nil {
		return nil, err
	}

	err = deps.Provide(func() metaModel.ClientConn {
		return s.metastores.BusinessClientConn()
	})
	if err != nil {
		return nil, err
	}

	err = deps.Provide(func() pkgClient.ExecutorGroup {
		return s.executorGroup
	})
	if err != nil {
		return nil, err
	}

	err = deps.Provide(func() pkgClient.ServerMasterClient {
		return s.masterClient
	})
	if err != nil {
		return nil, err
	}

	err = deps.Provide(func() broker.Broker {
		return s.resourceBroker
	})
	if err != nil {
		return nil, err
	}

	return deps, nil
}
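// Dependencies registered in buildDeps above are consumed through dig-style
// parameter structs: a component embeds dig.In, lists the fields it needs, and
// fills them from the context's container. precheckMasterMeta later in this
// file is an in-file example, populating its FrameMetaClient field via
// dctx.Deps().Fill(&param).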
func (s *Server) makeTask(
	ctx context.Context,
	projectInfo *pb.ProjectInfo,
	workerID frameModel.WorkerID,
	masterID frameModel.MasterID,
	workerType frameModel.WorkerType,
	workerConfig []byte,
	workerEpoch frameModel.Epoch,
) (worker.Runnable, error) {
	dctx := dcontext.NewContext(ctx)
	dp, err := s.buildDeps()
	if err != nil {
		return nil, err
	}
	dctx = dctx.WithDeps(dp)
	dctx.Environ.NodeID = p2p.NodeID(s.selfID)
	dctx.Environ.Addr = s.cfg.AdvertiseAddr
	dctx.ProjectInfo = tenant.NewProjectInfo(projectInfo.GetTenantId(), projectInfo.GetProjectId())

	logger := frameLog.WithProjectInfo(logutil.FromContext(ctx), dctx.ProjectInfo)
	logutil.NewContextWithLogger(dctx, logger)

	// NOTICE: only take effect when job type is job master
	masterMeta := &frameModel.MasterMeta{
		ProjectID: dctx.ProjectInfo.UniqueID(),
		ID:        workerID,
		Type:      workerType,
		Config:    workerConfig,
	}
	metaBytes, err := masterMeta.Marshal()
	if err != nil {
		return nil, err
	}
	dctx.Environ.MasterMetaBytes = metaBytes

	globalRegistry := registry.GlobalWorkerRegistry()
	newWorker, err := globalRegistry.CreateWorker(
		dctx,
		workerType,
		workerID,
		masterID,
		workerConfig,
		workerEpoch,
	)
	if err != nil {
		log.Error("Failed to create worker", zap.Error(err))
		return nil, err
	}
	if _, ok := newWorker.(framework.BaseJobMaster); ok {
		err := precheckMasterMeta(dctx, globalRegistry, workerID, workerType)
		if err != nil {
			return nil, err
		}
	}
	if jm, ok := newWorker.(framework.BaseJobMasterExt); ok {
		jobID := newWorker.ID()
		s.jobAPISrv.initialize(jobID, jm.TriggerOpenAPIInitialize)
	}

	return taskutil.WrapWorker(newWorker), nil
}

// precheckMasterMeta checks job master metadata before running it, and stops
// task creation if the job master has already met an unretryable business
// error. A returned error means either this function itself failed or job
// creation should be terminated.
func precheckMasterMeta(
	dctx *dcontext.Context,
	register registry.Registry,
	id frameModel.MasterID,
	tp frameModel.WorkerType,
) error {
	var param struct {
		dig.In
		FrameMetaClient pkgOrm.Client
	}
	if err := dctx.Deps().Fill(&param); err != nil {
		log.Panic("failed to fill dependencies", zap.Error(err))
	}
	meta, err := param.FrameMetaClient.GetJobByID(dctx, id)
	if err != nil {
		return err
	}
	if meta.ErrorMsg == "" {
		return nil
	}
	errInMeta := errors.New(meta.ErrorMsg)
	retryable, err := checkBusinessErrorIsRetryable(register, errInMeta, tp)
	if err != nil {
		return err
	} else if !retryable {
		return errInMeta
	}
	return nil
}

// convertMakeTaskErrorToRPCError converts an error returned from `makeTask` to
// a gRPC friendly error.
func convertMakeTaskErrorToRPCError(
	register registry.Registry, err error, tp frameModel.WorkerType,
) error {
	if errors.Is(err, errors.ErrCreateWorkerTerminate) {
		return err
	}

	retryable, inErr := checkBusinessErrorIsRetryable(register, err, tp)
	if inErr != nil {
		return inErr
	}
	if retryable {
		return errors.ErrCreateWorkerNonTerminate.Wrap(err).GenWithStackByArgs()
	}
	return errors.ErrCreateWorkerTerminate.Wrap(err).GenWithStackByArgs()
}

// checkBusinessErrorIsRetryable converts a raw error to a business error if
// possible, and checks whether this error is retryable from the perspective of
// business logic.
func checkBusinessErrorIsRetryable(
	register registry.Registry, err error, tp frameModel.WorkerType,
) (retryable bool, retErr error) {
	err = errorutil.ConvertErr(tp, err)
	return register.IsRetryableError(err, tp)
}
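// Task dispatch is a two-phase protocol driven by the server master. In the
// first phase, PreDispatchTask builds the Runnable via makeTask and parks it
// in the TaskCommitter under the request ID; in the second phase,
// ConfirmDispatchTask submits the parked task to the TaskRunner. A
// pre-dispatched task that is never confirmed is presumably discarded once
// defaultTaskPreDispatchRequestTTL expires (the TTL the TaskCommitter is
// created with in Run).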
// PreDispatchTask implements Executor.PreDispatchTask
func (s *Server) PreDispatchTask(ctx context.Context, req *pb.PreDispatchTaskRequest) (*pb.PreDispatchTaskResponse, error) {
	if !s.isReadyToServe() {
		return nil, status.Error(codes.Unavailable, "executor server is not ready")
	}

	workerType := frameModel.WorkerType(req.GetTaskTypeId())
	task, err := s.makeTask(
		ctx,
		req.GetProjectInfo(),
		req.GetWorkerId(),
		req.GetMasterId(),
		workerType,
		req.GetTaskConfig(),
		req.GetWorkerEpoch(),
	)
	if err != nil {
		return nil, convertMakeTaskErrorToRPCError(registry.GlobalWorkerRegistry(), err, workerType)
	}

	if !s.taskCommitter.PreDispatchTask(req.GetRequestId(), task) {
		// The TaskCommitter failed to accept the task.
		// Currently, the only reason is duplicate requestID.
		return nil, status.Error(codes.AlreadyExists, "Duplicate request ID")
	}

	return &pb.PreDispatchTaskResponse{}, nil
}

// ConfirmDispatchTask implements Executor.ConfirmDispatchTask
func (s *Server) ConfirmDispatchTask(ctx context.Context, req *pb.ConfirmDispatchTaskRequest) (*pb.ConfirmDispatchTaskResponse, error) {
	if !s.isReadyToServe() {
		return nil, status.Error(codes.Unavailable, "executor server is not ready")
	}

	ok, err := s.taskCommitter.ConfirmDispatchTask(req.GetRequestId(), req.GetWorkerId())
	if err != nil {
		return nil, err
	}
	if !ok {
		return nil, errors.ErrDispatchTaskRequestIDNotFound.GenWithStackByArgs(req.GetRequestId())
	}
	return &pb.ConfirmDispatchTaskResponse{}, nil
}

// Stop stops all running goroutines and releases resources in Server
func (s *Server) Stop() {
	if s.grpcSrv != nil {
		s.grpcSrv.Stop()
	}

	if s.tcpServer != nil {
		err := s.tcpServer.Close()
		if err != nil {
			log.L().Error("close tcp server", zap.Error(err))
		}
	}

	if s.metastores.IsInitialized() {
		s.metastores.Close()
	}

	if s.mockSrv != nil {
		s.mockSrv.Stop()
	}

	// TODO: unregister self from master.
}

func (s *Server) startMsgService(ctx context.Context, wg *errgroup.Group) (err error) {
	s.msgServer, err = p2p.NewDependentMessageRPCService(string(s.selfID), nil, s.grpcSrv)
	if err != nil {
		return err
	}
	wg.Go(func() error {
		// TODO refactor this
		return s.msgServer.Serve(ctx, nil)
	})
	return nil
}

func (s *Server) isReadyToServe() bool {
	return s.metastores.IsInitialized()
}
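// The Run method below is the executor's main loop: it optionally enables GC
// tuning, starts the task runner and committer, initializes the master client,
// registers this executor, creates the resource broker and p2p message router,
// brings up the gRPC/HTTP services and the metastores, subscribes to discovery
// events to keep p2p peers and the executor client group up to date, and then
// keeps heartbeating and reporting metrics until the context is cancelled or a
// background goroutine fails.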
// Run drives server logic in independent background goroutines, and uses an
// error group to collect errors.
func (s *Server) Run(ctx context.Context) error {
	if s.cfg.EnableGCTuning {
		limit, err := memory.MemTotal()
		if err != nil {
			log.Warn("get memory failed", zap.Error(err))
			limit = 0
		}
		threshold := limit * 7 / 10
		log.Info("set memory threshold to GC tuner",
			zap.Uint64("memory limit", limit),
			zap.Uint64("threshold", threshold))
		gctuner.EnableGOGCTuner.Store(true)
		gctuner.SetMinGCPercent(20)
		gctuner.Tuning(threshold)
	}

	wg, ctx := errgroup.WithContext(ctx)
	s.taskRunner = worker.NewTaskRunner(defaultRuntimeIncomingQueueLen, defaultRuntimeInitConcurrency)
	s.taskCommitter = worker.NewTaskCommitter(s.taskRunner, defaultTaskPreDispatchRequestTTL)
	defer func() {
		s.taskCommitter.Close()
	}()

	wg.Go(func() error {
		return s.taskRunner.Run(ctx)
	})

	wg.Go(func() error {
		taskStopReceiver := s.taskRunner.TaskStopReceiver()
		defer taskStopReceiver.Close()
		return s.jobAPISrv.listenStoppedJobs(ctx, taskStopReceiver.C)
	})

	err := s.initClients()
	if err != nil {
		return err
	}
	err = s.selfRegister(ctx)
	if err != nil {
		return err
	}

	s.resourceBroker, err = broker.NewBroker(ctx, s.selfID, s.masterClient)
	if err != nil {
		return err
	}
	defer s.resourceBroker.Close()

	s.p2pMsgRouter = p2p.NewMessageRouter(p2p.NodeID(s.selfID), s.cfg.AdvertiseAddr)

	s.grpcSrv = grpc.NewServer(
		grpc.StreamInterceptor(grpcprometheus.StreamServerInterceptor),
		grpc.ChainUnaryInterceptor(
			grpcprometheus.UnaryServerInterceptor,
			rpcutil.NormalizeError(),
		),
	)
	err = s.startMsgService(ctx, wg)
	if err != nil {
		return err
	}

	err = s.startTCPService(ctx, wg)
	if err != nil {
		return err
	}

	if err := s.metastores.Init(ctx, s.masterClient); err != nil {
		log.L().Error("Failed to init metastores", zap.Error(err))
		return err
	}

	discoveryAgent := discovery.NewAgent(s.masterClient, defaultDiscoveryAutoSyncInterval)
	wg.Go(func() error {
		return discoveryAgent.Run(ctx)
	})

	wg.Go(func() error {
		snap, receiver, err := discoveryAgent.Subscribe(ctx)
		if err != nil {
			return err
		}

		for _, node := range snap {
			log.Debug("update p2p msg router by snapshot", zap.Any("node", node))
			s.p2pMsgRouter.AddPeer(node.ID, node.Addr)
		}

		for {
			var event discovery.Event
			select {
			case <-ctx.Done():
				return errors.Trace(err)
			case event = <-receiver.C:
			}

			log.Debug("update p2p msg router", zap.Any("event", event))
			if event.Tp == discovery.EventTypeDel {
				s.p2pMsgRouter.RemovePeer(event.Node.ID)
			} else if event.Tp == discovery.EventTypeAdd {
				s.p2pMsgRouter.AddPeer(event.Node.ID, event.Node.Addr)
			}
		}
	})

	wg.Go(func() error {
		snap, receiver, err := discoveryAgent.Subscribe(ctx)
		if err != nil {
			return err
		}
		defer receiver.Close()

		for _, node := range snap {
			if node.Tp != discovery.NodeTypeExecutor {
				continue
			}

			log.Debug("update executor client group by snapshot", zap.Any("node", node))
			err := s.executorGroup.AddExecutor(model.ExecutorID(node.ID), node.Addr)
			if err != nil {
				return err
			}
		}

		for {
			var event discovery.Event
			select {
			case <-ctx.Done():
				return errors.Trace(err)
			case event = <-receiver.C:
			}

			if event.Node.Tp != discovery.NodeTypeExecutor {
				continue
			}

log.Debug("update executor client group", zap.Any("event", event)) 518 if event.Tp == discovery.EventTypeDel { 519 err := s.executorGroup.RemoveExecutor(model.ExecutorID(event.Node.ID)) 520 if err != nil { 521 return err 522 } 523 } else if event.Tp == discovery.EventTypeAdd { 524 err := s.executorGroup.AddExecutor(model.ExecutorID(event.Node.ID), event.Node.Addr) 525 if err != nil { 526 return err 527 } 528 } 529 } 530 }) 531 532 wg.Go(func() error { 533 return s.keepHeartbeat(ctx) 534 }) 535 536 wg.Go(func() error { 537 return s.reportTaskResc(ctx) 538 }) 539 540 wg.Go(func() error { 541 return s.bgUpdateServerMasterClients(ctx) 542 }) 543 544 wg.Go(func() error { 545 return s.collectMetricLoop(ctx, defaultMetricInterval) 546 }) 547 548 return wg.Wait() 549 } 550 551 // startTCPService starts grpc server and http server 552 func (s *Server) startTCPService(ctx context.Context, wg *errgroup.Group) error { 553 tcpServer, err := tcpserver.NewTCPServer(s.cfg.Addr, &security.Credential{}) 554 if err != nil { 555 return err 556 } 557 s.tcpServer = tcpServer 558 pb.RegisterExecutorServiceServer(s.grpcSrv, s) 559 pb.RegisterBrokerServiceServer(s.grpcSrv, s.resourceBroker) 560 log.Info("listen address", zap.String("addr", s.cfg.Addr)) 561 562 wg.Go(func() error { 563 return s.tcpServer.Run(ctx) 564 }) 565 566 wg.Go(func() error { 567 return s.grpcSrv.Serve(s.tcpServer.GrpcListener()) 568 }) 569 570 wg.Go(func() error { 571 mux := http.NewServeMux() 572 573 mux.HandleFunc("/debug/pprof/", pprof.Index) 574 mux.HandleFunc("/debug/pprof/cmdline", pprof.Cmdline) 575 mux.HandleFunc("/debug/pprof/profile", pprof.Profile) 576 mux.HandleFunc("/debug/pprof/symbol", pprof.Symbol) 577 mux.HandleFunc("/debug/pprof/trace", pprof.Trace) 578 mux.Handle("/metrics", promutil.HTTPHandlerForMetric()) 579 mux.Handle(openapi.JobAPIPrefix, s.jobAPISrv) 580 581 httpSrv := &http.Server{ 582 Handler: mux, 583 ReadHeaderTimeout: time.Minute, 584 } 585 err := httpSrv.Serve(s.tcpServer.HTTP1Listener()) 586 if err != nil && !common.IsErrNetClosing(err) && err != http.ErrServerClosed { 587 log.L().Error("http server returned", logutil.ShortError(err)) 588 } 589 return err 590 }) 591 return nil 592 } 593 594 func (s *Server) initClients() (err error) { 595 // initServerMasterList is a MasterServerList with all servers marked as followers. 596 initServerMasterList := getInitServerMasterList(s.cfg.Join) 597 // TODO support TLS 598 s.masterClient, err = pkgClient.NewServerMasterClientWithFailOver(initServerMasterList, nil) 599 if err != nil { 600 log.L().Info("master client init Failed", 601 zap.String("server-addrs", s.cfg.Join), 602 logutil.ShortError(err)) 603 return err 604 } 605 log.L().Info("master client init successful", 606 zap.String("server-addrs", s.cfg.Join)) 607 608 s.executorGroup = pkgClient.NewExecutorGroup(nil, log.L()) 609 return nil 610 } 611 612 func (s *Server) selfRegister(ctx context.Context) error { 613 registerReq := &pb.RegisterExecutorRequest{ 614 Executor: &pb.Executor{ 615 Name: s.cfg.Name, 616 Address: s.cfg.AdvertiseAddr, 617 Labels: s.cfg.Labels, 618 }, 619 } 620 executorID, err := s.masterClient.RegisterExecutor(ctx, registerReq) 621 if err != nil { 622 return err 623 } 624 625 s.selfID = executorID 626 log.L().Info("register successful", zap.String("executor-id", string(executorID))) 627 return nil 628 } 629 630 // TODO: Right now heartbeat maintainable is too simple. We should look into 631 // what other frameworks do or whether we can use grpc heartbeat. 
// TODO: Right now heartbeat maintenance is too simple. We should look into
// what other frameworks do or whether we can use grpc heartbeat.
func (s *Server) keepHeartbeat(ctx context.Context) error {
	ticker := time.NewTicker(s.cfg.KeepAliveInterval)
	s.lastHearbeatTime = time.Now()
	rl := rate.NewLimiter(rate.Every(time.Second*5), 1 /*burst*/)
	for {
		select {
		case <-ctx.Done():
			return nil
		case t := <-ticker.C:
			if s.lastHearbeatTime.Add(s.cfg.KeepAliveTTL).Before(time.Now()) {
				return errors.ErrHeartbeat.GenWithStack("timeout")
			}
			req := &pb.HeartbeatRequest{
				ExecutorId: string(s.selfID),
				Timestamp:  uint64(t.Unix()),
				// We set a longer ttl for the master, namely "ttl + rpc timeout", to avoid
				// the executor actually waiting out a timeout when the ttl is nearly up.
				Ttl: uint64(s.cfg.KeepAliveTTL.Milliseconds() + s.cfg.RPCTimeout.Milliseconds()),
			}
			_, err := s.masterClient.Heartbeat(ctx, req)
			if err != nil {
				if errors.Is(err, errors.ErrMasterNotReady) {
					s.lastHearbeatTime = t
					if rl.Allow() {
						log.L().Info("heartbeat success with MasterNotReady")
					}
					continue
				}

				log.Warn("heartbeat rpc meet error", zap.Error(err))
				if errors.Is(err, errors.ErrTombstoneExecutor) {
					return errors.ErrHeartbeat.GenWithStack("logic error: %v", err)
				}

				if s.lastHearbeatTime.Add(s.cfg.KeepAliveTTL).Before(time.Now()) {
					return errors.WrapError(errors.ErrHeartbeat, err, "timeout")
				}
				continue
			}

			// We aim to keep the executor's lastHbTime consistent with the master's
			// lastHbTime. If we set the executor's heartbeat time to the start time of
			// the rpc, it will be a little earlier than the master's heartbeat time,
			// which is safe. In contrast, if we set it to the end time of the rpc, it
			// might be a little later than the master's, which could cause the master
			// to wait for less time than the executor. That gap is unsafe.
			s.lastHearbeatTime = t
			if rl.Allow() {
				log.L().Info("heartbeat success")
			}
		}
	}
}

func getJoinURLs(addrs string) []string {
	return strings.Split(addrs, ",")
}

// getInitServerMasterList returns a MasterServerList with
// all servers marked as the follower.
func getInitServerMasterList(addrs string) pkgClient.MasterServerList {
	ret := make(pkgClient.MasterServerList, len(addrs))
	for _, addr := range getJoinURLs(addrs) {
		ret[addr] = false // Mark no leader
	}
	return ret
}
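// For illustration (the addresses are hypothetical): a s.cfg.Join value of
// "10.0.1.10:10240,10.0.1.11:10240" is split by getJoinURLs into two
// addresses, and getInitServerMasterList maps both to false, i.e.
// {"10.0.1.10:10240": false, "10.0.1.11:10240": false}. Which entry is the
// leader is learned later from the master via bgUpdateServerMasterClients.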
func (s *Server) reportTaskRescOnce(ctx context.Context) error {
	// TODO: do we need to report allocated resource to master?
	// TODO: Implement task-wise workload reporting in TaskRunner.
	/*
		rescs := s.workerRtm.Workload()
		req := &pb.ExecWorkloadRequest{
			// TODO: use which field as ExecutorId is more accurate
			ExecutorId: s.cfg.WorkerAddr,
			Workloads:  make([]*pb.ExecWorkload, 0, len(rescs)),
		}
		for tp, resc := range rescs {
			req.Workloads = append(req.Workloads, &pb.ExecWorkload{
				Tp:    pb.JobType(tp),
				Usage: int32(resc),
			})
		}
		resp, err := s.masterClient.ReportExecutorWorkload(ctx, req)
		if err != nil {
			return err
		}
		if resp.Err != nil {
			log.Warn("report executor workload error", zap.String("err", resp.Err.String()))
		}
	*/
	return nil
}

// reportTaskResc reports task resource usage to the resource manager periodically.
func (s *Server) reportTaskResc(ctx context.Context) error {
	ticker := time.NewTicker(time.Second * 10)
	defer ticker.Stop()
	for {
		select {
		case <-ctx.Done():
			return nil
		case <-ticker.C:
			err := s.reportTaskRescOnce(ctx)
			if err != nil {
				return err
			}
		}
	}
}

func (s *Server) bgUpdateServerMasterClients(ctx context.Context) error {
	for {
		select {
		case <-ctx.Done():
			return errors.Trace(ctx.Err())
		case <-time.After(defaultDiscoveryAutoSyncInterval):
			masters, err := s.masterClient.ListMasters(ctx)
			if err != nil {
				log.Warn("update master list error", zap.Error(err))
				continue
			}
			masterList := make(pkgClient.MasterServerList)
			for _, m := range masters {
				masterList[m.Address] = m.IsLeader
			}
			if failoverCli, ok := s.masterClient.(*pkgClient.ServerMasterClientWithFailOver); ok {
				failoverCli.UpdateServerList(masterList)
			}
		}
	}
}

func (s *Server) collectMetricLoop(ctx context.Context, tickInterval time.Duration) error {
	metricRunningTask := executorTaskNumGauge.WithLabelValues("running")
	ticker := time.NewTicker(tickInterval)
	defer ticker.Stop()
	for {
		select {
		case <-ctx.Done():
			return nil
		case <-ticker.C:
			metricRunningTask.Set(float64(s.taskRunner.TaskCount()))
		}
	}
}
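// runExecutorExample is a hypothetical usage sketch, not part of the upstream
// file: it shows the minimal lifecycle a caller (typically the executor
// command-line entrypoint) would drive with the exported API above: construct
// the Server from a Config, run it until the context is cancelled or a fatal
// error occurs, then release resources with Stop.
func runExecutorExample(ctx context.Context, cfg *Config) error {
	srv := NewServer(cfg)
	// Stop is safe to call after Run returns; it tears down the gRPC/TCP
	// servers, metastore connections and the mock server if they were started.
	defer srv.Stop()

	// Run blocks, driving all background goroutines via an errgroup.
	return srv.Run(ctx)
}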