github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/engine/executor/server.go

// Copyright 2022 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// See the License for the specific language governing permissions and
// limitations under the License.

package executor

import (
	"context"
	"net/http"
	"net/http/pprof"
	"strings"
	"time"

	grpcprometheus "github.com/grpc-ecosystem/go-grpc-prometheus"
	"github.com/pingcap/log"
	"github.com/pingcap/tidb/pkg/util/gctuner"
	"github.com/pingcap/tidb/pkg/util/memory"
	"github.com/pingcap/tiflow/dm/common"
	pb "github.com/pingcap/tiflow/engine/enginepb"
	"github.com/pingcap/tiflow/engine/executor/server"
	"github.com/pingcap/tiflow/engine/executor/worker"
	"github.com/pingcap/tiflow/engine/framework"
	frameLog "github.com/pingcap/tiflow/engine/framework/logutil"
	frameModel "github.com/pingcap/tiflow/engine/framework/model"
	"github.com/pingcap/tiflow/engine/framework/registry"
	"github.com/pingcap/tiflow/engine/framework/taskutil"
	"github.com/pingcap/tiflow/engine/internal/pkg/discovery"
	"github.com/pingcap/tiflow/engine/model"
	pkgClient "github.com/pingcap/tiflow/engine/pkg/client"
	dcontext "github.com/pingcap/tiflow/engine/pkg/context"
	"github.com/pingcap/tiflow/engine/pkg/deps"
	"github.com/pingcap/tiflow/engine/pkg/externalresource/broker"
	metaModel "github.com/pingcap/tiflow/engine/pkg/meta/model"
	"github.com/pingcap/tiflow/engine/pkg/openapi"
	pkgOrm "github.com/pingcap/tiflow/engine/pkg/orm"
	"github.com/pingcap/tiflow/engine/pkg/p2p"
	"github.com/pingcap/tiflow/engine/pkg/promutil"
	"github.com/pingcap/tiflow/engine/pkg/rpcutil"
	"github.com/pingcap/tiflow/engine/pkg/tenant"
	"github.com/pingcap/tiflow/engine/test/mock"
	"github.com/pingcap/tiflow/pkg/errors"
	"github.com/pingcap/tiflow/pkg/errorutil"
	"github.com/pingcap/tiflow/pkg/logutil"
	p2pImpl "github.com/pingcap/tiflow/pkg/p2p"
	"github.com/pingcap/tiflow/pkg/security"
	"github.com/pingcap/tiflow/pkg/tcpserver"
	"go.uber.org/dig"
	"go.uber.org/zap"
	"golang.org/x/sync/errgroup"
	"golang.org/x/time/rate"
	"google.golang.org/grpc"
	"google.golang.org/grpc/codes"
	"google.golang.org/grpc/status"
)

const (
	// TODO: since we introduced queuing in the TaskRunner, it is no longer
	// easy to enforce a capacity limit. Think of a better solution later.
	// defaultRuntimeCapacity      = 65536
	defaultRuntimeIncomingQueueLen   = 256
	defaultRuntimeInitConcurrency    = 256
	defaultTaskPreDispatchRequestTTL = 10 * time.Second
	defaultDiscoveryAutoSyncInterval = 5 * time.Second
)
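// These defaults are wired together in Run below: the queue length and initial
// concurrency size the TaskRunner, the pre-dispatch TTL bounds how long the
// TaskCommitter holds an unconfirmed task, and the auto-sync interval drives
// both the discovery agent and the master-list refresh. A rough sketch of that
// wiring (Run holds the authoritative code):
//
//	taskRunner := worker.NewTaskRunner(defaultRuntimeIncomingQueueLen, defaultRuntimeInitConcurrency)
//	taskCommitter := worker.NewTaskCommitter(taskRunner, defaultTaskPreDispatchRequestTTL)
//	discoveryAgent := discovery.NewAgent(masterClient, defaultDiscoveryAutoSyncInterval)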

// Server is an executor server.
type Server struct {
	cfg *Config

	tcpServer     tcpserver.TCPServer
	grpcSrv       *grpc.Server
	masterClient  pkgClient.ServerMasterClient
	executorGroup *pkgClient.DefaultExecutorGroup
	taskRunner    *worker.TaskRunner
	taskCommitter *worker.TaskCommitter
	msgServer     *p2p.MessageRPCService
	selfID        model.ExecutorID

	lastHearbeatTime time.Time

	mockSrv mock.GrpcServer

	metastores server.MetastoreManager

	p2pMsgRouter   p2pImpl.MessageRouter
	resourceBroker broker.Broker
	jobAPISrv      *jobAPIServer
}
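// The expected lifecycle, as suggested by the methods below, is roughly:
// construct the server with NewServer, drive it with Run (which blocks until
// the context is canceled or a fatal error occurs), and release resources with
// Stop. An illustrative sketch (error handling elided):
//
//	srv := NewServer(cfg)
//	defer srv.Stop()
//	if err := srv.Run(ctx); err != nil {
//		log.Error("executor exited", zap.Error(err))
//	}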

// NewServer creates a new executor server instance
func NewServer(cfg *Config) *Server {
	log.Info("creating executor", zap.Stringer("config", cfg))

	registerWorkerOnce.Do(registerWorkers)
	s := Server{
		cfg:        cfg,
		jobAPISrv:  newJobAPIServer(),
		metastores: server.NewMetastoreManager(),
	}
	return &s
}

func (s *Server) buildDeps() (*deps.Deps, error) {
	deps := deps.NewDeps()
	err := deps.Provide(func() p2p.MessageHandlerManager {
		return s.msgServer.MakeHandlerManager()
	})
	if err != nil {
		return nil, err
	}

	err = deps.Provide(func() p2p.MessageSender {
		return p2p.NewMessageSender(s.p2pMsgRouter)
	})
	if err != nil {
		return nil, err
	}

	cli, err := pkgOrm.NewClient(s.metastores.FrameworkClientConn())
	if err != nil {
		return nil, err
	}
	err = deps.Provide(func() pkgOrm.Client {
		return cli
	})
	if err != nil {
		return nil, err
	}

	err = deps.Provide(func() metaModel.ClientConn {
		return s.metastores.BusinessClientConn()
	})
	if err != nil {
		return nil, err
	}

	err = deps.Provide(func() pkgClient.ExecutorGroup {
		return s.executorGroup
	})
	if err != nil {
		return nil, err
	}

	err = deps.Provide(func() pkgClient.ServerMasterClient {
		return s.masterClient
	})
	if err != nil {
		return nil, err
	}

	err = deps.Provide(func() broker.Broker {
		return s.resourceBroker
	})
	if err != nil {
		return nil, err
	}

	return deps, nil
}
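// The dependencies provided above are consumed through dig-style injection: a
// worker (or a helper such as precheckMasterMeta below) declares a struct that
// embeds dig.In and lets dctx.Deps().Fill populate it. A minimal sketch, with
// hypothetical field choices:
//
//	var param struct {
//		dig.In
//		MessageSender   p2p.MessageSender
//		FrameMetaClient pkgOrm.Client
//	}
//	if err := dctx.Deps().Fill(&param); err != nil {
//		// a dependency was not provided
//	}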

func (s *Server) makeTask(
	ctx context.Context,
	projectInfo *pb.ProjectInfo,
	workerID frameModel.WorkerID,
	masterID frameModel.MasterID,
	workerType frameModel.WorkerType,
	workerConfig []byte,
	workerEpoch frameModel.Epoch,
) (worker.Runnable, error) {
	dctx := dcontext.NewContext(ctx)
	dp, err := s.buildDeps()
	if err != nil {
		return nil, err
	}
	dctx = dctx.WithDeps(dp)
	dctx.Environ.NodeID = p2p.NodeID(s.selfID)
	dctx.Environ.Addr = s.cfg.AdvertiseAddr
	dctx.ProjectInfo = tenant.NewProjectInfo(projectInfo.GetTenantId(), projectInfo.GetProjectId())

	logger := frameLog.WithProjectInfo(logutil.FromContext(ctx), dctx.ProjectInfo)
	logutil.NewContextWithLogger(dctx, logger)

	// NOTICE: masterMeta only takes effect when the task being created is a job master.
	masterMeta := &frameModel.MasterMeta{
		ProjectID: dctx.ProjectInfo.UniqueID(),
		ID:        workerID,
		Type:      workerType,
		Config:    workerConfig,
	}
	metaBytes, err := masterMeta.Marshal()
	if err != nil {
		return nil, err
	}
	dctx.Environ.MasterMetaBytes = metaBytes

	globalRegistry := registry.GlobalWorkerRegistry()
	newWorker, err := globalRegistry.CreateWorker(
		dctx,
		workerType,
		workerID,
		masterID,
		workerConfig,
		workerEpoch,
	)
	if err != nil {
		log.Error("Failed to create worker", zap.Error(err))
		return nil, err
	}
	if _, ok := newWorker.(framework.BaseJobMaster); ok {
		err := precheckMasterMeta(dctx, globalRegistry, workerID, workerType)
		if err != nil {
			return nil, err
		}
	}
	if jm, ok := newWorker.(framework.BaseJobMasterExt); ok {
		jobID := newWorker.ID()
		s.jobAPISrv.initialize(jobID, jm.TriggerOpenAPIInitialize)
	}

	return taskutil.WrapWorker(newWorker), nil
}

// precheckMasterMeta checks the job master metadata before running it, and
// stops task creation if the job master has hit an unretryable business error.
// A returned error means that this function itself failed or that job creation
// should be terminated.
func precheckMasterMeta(
	dctx *dcontext.Context,
	register registry.Registry,
	id frameModel.MasterID,
	tp frameModel.WorkerType,
) error {
	var param struct {
		dig.In
		FrameMetaClient pkgOrm.Client
	}
	if err := dctx.Deps().Fill(&param); err != nil {
		log.Panic("failed to fill dependencies", zap.Error(err))
	}
	meta, err := param.FrameMetaClient.GetJobByID(dctx, id)
	if err != nil {
		return err
	}
	if meta.ErrorMsg == "" {
		return nil
	}
	errInMeta := errors.New(meta.ErrorMsg)
	retryable, err := checkBusinessErrorIsRetryable(register, errInMeta, tp)
	if err != nil {
		return err
	} else if !retryable {
		return errInMeta
	}
	return nil
}

// convertMakeTaskErrorToRPCError converts an error returned from `makeTask` to
// a gRPC-friendly error.
func convertMakeTaskErrorToRPCError(
	register registry.Registry, err error, tp frameModel.WorkerType,
) error {
	if errors.Is(err, errors.ErrCreateWorkerTerminate) {
		return err
	}

	retryable, inErr := checkBusinessErrorIsRetryable(register, err, tp)
	if inErr != nil {
		return inErr
	}
	if retryable {
		return errors.ErrCreateWorkerNonTerminate.Wrap(err).GenWithStackByArgs()
	}
	return errors.ErrCreateWorkerTerminate.Wrap(err).GenWithStackByArgs()
}

// checkBusinessErrorIsRetryable converts a raw error to a business error if
// possible, and checks whether the error is retryable from the perspective of
// the business logic.
func checkBusinessErrorIsRetryable(
	register registry.Registry, err error, tp frameModel.WorkerType,
) (retryable bool, retErr error) {
	err = errorutil.ConvertErr(tp, err)
	return register.IsRetryableError(err, tp)
}
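// Taken together, the two helpers above classify a failed makeTask call: errors
// that the business registry deems retryable are wrapped as
// ErrCreateWorkerNonTerminate, everything else as ErrCreateWorkerTerminate.
// Presumably the server master uses this distinction to decide between retrying
// the dispatch and failing the job permanently; the authoritative handling
// lives on the master side, not in this file.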

// PreDispatchTask implements Executor.PreDispatchTask
func (s *Server) PreDispatchTask(ctx context.Context, req *pb.PreDispatchTaskRequest) (*pb.PreDispatchTaskResponse, error) {
	if !s.isReadyToServe() {
		return nil, status.Error(codes.Unavailable, "executor server is not ready")
	}

	workerType := frameModel.WorkerType(req.GetTaskTypeId())
	task, err := s.makeTask(
		ctx,
		req.GetProjectInfo(),
		req.GetWorkerId(),
		req.GetMasterId(),
		workerType,
		req.GetTaskConfig(),
		req.GetWorkerEpoch(),
	)
	if err != nil {
		return nil, convertMakeTaskErrorToRPCError(registry.GlobalWorkerRegistry(), err, workerType)
	}

	if !s.taskCommitter.PreDispatchTask(req.GetRequestId(), task) {
		// The TaskCommitter failed to accept the task.
		// Currently, the only reason is a duplicate requestID.
		return nil, status.Error(codes.AlreadyExists, "Duplicate request ID")
	}

	return &pb.PreDispatchTaskResponse{}, nil
}

// ConfirmDispatchTask implements Executor.ConfirmDispatchTask
func (s *Server) ConfirmDispatchTask(ctx context.Context, req *pb.ConfirmDispatchTaskRequest) (*pb.ConfirmDispatchTaskResponse, error) {
	if !s.isReadyToServe() {
		return nil, status.Error(codes.Unavailable, "executor server is not ready")
	}

	ok, err := s.taskCommitter.ConfirmDispatchTask(req.GetRequestId(), req.GetWorkerId())
	if err != nil {
		return nil, err
	}
	if !ok {
		return nil, errors.ErrDispatchTaskRequestIDNotFound.GenWithStackByArgs(req.GetRequestId())
	}
	return &pb.ConfirmDispatchTaskResponse{}, nil
}
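// PreDispatchTask and ConfirmDispatchTask together form a two-phase dispatch:
// the caller first stages the task under a request ID, then confirms it so that
// the TaskCommitter submits it to the TaskRunner. If the confirmation never
// arrives, the staged task is expected to be dropped after
// defaultTaskPreDispatchRequestTTL. An illustrative caller-side sketch using the
// generated gRPC client (the real caller is the server master; its actual code
// is not shown here):
//
//	cli := pb.NewExecutorServiceClient(conn)
//	_, err := cli.PreDispatchTask(ctx, &pb.PreDispatchTaskRequest{RequestId: reqID /* worker, master, config, ... */})
//	if err == nil {
//		_, err = cli.ConfirmDispatchTask(ctx, &pb.ConfirmDispatchTaskRequest{RequestId: reqID, WorkerId: workerID})
//	}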

// Stop stops all running goroutines and releases resources in Server
func (s *Server) Stop() {
	if s.grpcSrv != nil {
		s.grpcSrv.Stop()
	}

	if s.tcpServer != nil {
		err := s.tcpServer.Close()
		if err != nil {
			log.L().Error("close tcp server", zap.Error(err))
		}
	}

	if s.metastores.IsInitialized() {
		s.metastores.Close()
	}

	if s.mockSrv != nil {
		s.mockSrv.Stop()
	}

	// TODO: unregister self from master.
}

func (s *Server) startMsgService(ctx context.Context, wg *errgroup.Group) (err error) {
	s.msgServer, err = p2p.NewDependentMessageRPCService(string(s.selfID), nil, s.grpcSrv)
	if err != nil {
		return err
	}
	wg.Go(func() error {
		// TODO refactor this
		return s.msgServer.Serve(ctx, nil)
	})
	return nil
}

func (s *Server) isReadyToServe() bool {
	return s.metastores.IsInitialized()
}

// Run drives the server logic in independent background goroutines and uses an
// errgroup to collect errors from them.
func (s *Server) Run(ctx context.Context) error {
	if s.cfg.EnableGCTuning {
		limit, err := memory.MemTotal()
		if err != nil {
			log.Warn("get memory failed", zap.Error(err))
			limit = 0
		}
		threshold := limit * 7 / 10
		log.Info("set memory threshold to GC tuner",
			zap.Uint64("memory limit", limit),
			zap.Uint64("threshold", threshold))
		gctuner.EnableGOGCTuner.Store(true)
		gctuner.SetMinGCPercent(20)
		gctuner.Tuning(threshold)
	}
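	// Rough intent of the block above: with the tuner enabled, GOGC is adjusted
	// dynamically (never below the 20 percent floor set via SetMinGCPercent) so
	// that garbage collection becomes more aggressive as the heap approaches
	// 70% of total memory. This is a summary of how tidb's gctuner is meant to
	// behave rather than something enforced in this file.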

	wg, ctx := errgroup.WithContext(ctx)
	s.taskRunner = worker.NewTaskRunner(defaultRuntimeIncomingQueueLen, defaultRuntimeInitConcurrency)
	s.taskCommitter = worker.NewTaskCommitter(s.taskRunner, defaultTaskPreDispatchRequestTTL)
	defer func() {
		s.taskCommitter.Close()
	}()
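	// The committer sits in front of the runner: PreDispatchTask parks a task in
	// the TaskCommitter under its request ID, and a matching ConfirmDispatchTask
	// hands it over to the TaskRunner's incoming queue. Tasks that are never
	// confirmed are expected to be dropped once defaultTaskPreDispatchRequestTTL
	// expires.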

	wg.Go(func() error {
		return s.taskRunner.Run(ctx)
	})

	wg.Go(func() error {
		taskStopReceiver := s.taskRunner.TaskStopReceiver()
		defer taskStopReceiver.Close()
		return s.jobAPISrv.listenStoppedJobs(ctx, taskStopReceiver.C)
	})

	err := s.initClients()
	if err != nil {
		return err
	}
	err = s.selfRegister(ctx)
	if err != nil {
		return err
	}

	s.resourceBroker, err = broker.NewBroker(ctx, s.selfID, s.masterClient)
	if err != nil {
		return err
	}
	defer s.resourceBroker.Close()

	s.p2pMsgRouter = p2p.NewMessageRouter(p2p.NodeID(s.selfID), s.cfg.AdvertiseAddr)

	s.grpcSrv = grpc.NewServer(
		grpc.StreamInterceptor(grpcprometheus.StreamServerInterceptor),
		grpc.ChainUnaryInterceptor(
			grpcprometheus.UnaryServerInterceptor,
			rpcutil.NormalizeError(),
		),
	)
	err = s.startMsgService(ctx, wg)
	if err != nil {
		return err
	}

	err = s.startTCPService(ctx, wg)
	if err != nil {
		return err
	}

	if err := s.metastores.Init(ctx, s.masterClient); err != nil {
		log.L().Error("Failed to init metastores", zap.Error(err))
		return err
	}

	discoveryAgent := discovery.NewAgent(s.masterClient, defaultDiscoveryAutoSyncInterval)
	wg.Go(func() error {
		return discoveryAgent.Run(ctx)
	})

	wg.Go(func() error {
		snap, receiver, err := discoveryAgent.Subscribe(ctx)
		if err != nil {
			return err
		}
		defer receiver.Close()

		for _, node := range snap {
			log.Debug("update p2p msg router by snapshot", zap.Any("node", node))
			s.p2pMsgRouter.AddPeer(node.ID, node.Addr)
		}

		for {
			var event discovery.Event
			select {
			case <-ctx.Done():
				return errors.Trace(ctx.Err())
			case event = <-receiver.C:
			}

			log.Debug("update p2p msg router", zap.Any("event", event))
			if event.Tp == discovery.EventTypeDel {
				s.p2pMsgRouter.RemovePeer(event.Node.ID)
			} else if event.Tp == discovery.EventTypeAdd {
				s.p2pMsgRouter.AddPeer(event.Node.ID, event.Node.Addr)
			}
		}
	})

	wg.Go(func() error {
		snap, receiver, err := discoveryAgent.Subscribe(ctx)
		if err != nil {
			return err
		}
		defer receiver.Close()

		for _, node := range snap {
			if node.Tp != discovery.NodeTypeExecutor {
				continue
			}

			log.Debug("update executor client group by snapshot", zap.Any("node", node))
			err := s.executorGroup.AddExecutor(model.ExecutorID(node.ID), node.Addr)
			if err != nil {
				return err
			}
		}

		for {
			var event discovery.Event
			select {
			case <-ctx.Done():
				return errors.Trace(ctx.Err())
			case event = <-receiver.C:
			}

			if event.Node.Tp != discovery.NodeTypeExecutor {
				continue
			}

			log.Debug("update executor client group", zap.Any("event", event))
			if event.Tp == discovery.EventTypeDel {
				err := s.executorGroup.RemoveExecutor(model.ExecutorID(event.Node.ID))
				if err != nil {
					return err
				}
			} else if event.Tp == discovery.EventTypeAdd {
				err := s.executorGroup.AddExecutor(model.ExecutorID(event.Node.ID), event.Node.Addr)
				if err != nil {
					return err
				}
			}
		}
	})

	wg.Go(func() error {
		return s.keepHeartbeat(ctx)
	})

	wg.Go(func() error {
		return s.reportTaskResc(ctx)
	})

	wg.Go(func() error {
		return s.bgUpdateServerMasterClients(ctx)
	})

	wg.Go(func() error {
		return s.collectMetricLoop(ctx, defaultMetricInterval)
	})

	return wg.Wait()
}

// startTCPService starts the gRPC server and the HTTP server.
func (s *Server) startTCPService(ctx context.Context, wg *errgroup.Group) error {
	tcpServer, err := tcpserver.NewTCPServer(s.cfg.Addr, &security.Credential{})
	if err != nil {
		return err
	}
	s.tcpServer = tcpServer
	pb.RegisterExecutorServiceServer(s.grpcSrv, s)
	pb.RegisterBrokerServiceServer(s.grpcSrv, s.resourceBroker)
	log.Info("listen address", zap.String("addr", s.cfg.Addr))

	wg.Go(func() error {
		return s.tcpServer.Run(ctx)
	})

	wg.Go(func() error {
		return s.grpcSrv.Serve(s.tcpServer.GrpcListener())
	})

	wg.Go(func() error {
		mux := http.NewServeMux()

		mux.HandleFunc("/debug/pprof/", pprof.Index)
		mux.HandleFunc("/debug/pprof/cmdline", pprof.Cmdline)
		mux.HandleFunc("/debug/pprof/profile", pprof.Profile)
		mux.HandleFunc("/debug/pprof/symbol", pprof.Symbol)
		mux.HandleFunc("/debug/pprof/trace", pprof.Trace)
		mux.Handle("/metrics", promutil.HTTPHandlerForMetric())
		mux.Handle(openapi.JobAPIPrefix, s.jobAPISrv)

		httpSrv := &http.Server{
			Handler:           mux,
			ReadHeaderTimeout: time.Minute,
		}
		err := httpSrv.Serve(s.tcpServer.HTTP1Listener())
		if err != nil && !common.IsErrNetClosing(err) && err != http.ErrServerClosed {
			log.L().Error("http server returned", logutil.ShortError(err))
		}
		return err
	})
	return nil
}
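// Judging by the GrpcListener/HTTP1Listener split above, everything is served
// on the single s.cfg.Addr listener: the tcpserver package demultiplexes
// connections so that gRPC traffic reaches grpcSrv, while plain HTTP (pprof
// under /debug/pprof/, Prometheus metrics under /metrics, and the per-job
// OpenAPI under openapi.JobAPIPrefix) is handled by the mux built in
// startTCPService. For example, once the server is running, metrics should be
// reachable at http://<addr>/metrics.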

func (s *Server) initClients() (err error) {
	// initServerMasterList is a MasterServerList with all servers marked as followers.
	initServerMasterList := getInitServerMasterList(s.cfg.Join)
	// TODO support TLS
	s.masterClient, err = pkgClient.NewServerMasterClientWithFailOver(initServerMasterList, nil)
	if err != nil {
		log.L().Info("master client init Failed",
			zap.String("server-addrs", s.cfg.Join),
			logutil.ShortError(err))
		return err
	}
	log.L().Info("master client init successful",
		zap.String("server-addrs", s.cfg.Join))

	s.executorGroup = pkgClient.NewExecutorGroup(nil, log.L())
	return nil
}
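// Note that initClients starts with every configured master treated as a
// follower (see getInitServerMasterList); the failover client is expected to
// discover the actual leader on first use, and bgUpdateServerMasterClients
// keeps the leader/follower view fresh afterwards.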

func (s *Server) selfRegister(ctx context.Context) error {
	registerReq := &pb.RegisterExecutorRequest{
		Executor: &pb.Executor{
			Name:    s.cfg.Name,
			Address: s.cfg.AdvertiseAddr,
			Labels:  s.cfg.Labels,
		},
	}
	executorID, err := s.masterClient.RegisterExecutor(ctx, registerReq)
	if err != nil {
		return err
	}

	s.selfID = executorID
	log.L().Info("register successful", zap.String("executor-id", string(executorID)))
	return nil
}

// TODO: Right now the heartbeat maintenance is too simple. We should look into
// what other frameworks do or whether we can use grpc heartbeat.
func (s *Server) keepHeartbeat(ctx context.Context) error {
	ticker := time.NewTicker(s.cfg.KeepAliveInterval)
	defer ticker.Stop()
	s.lastHearbeatTime = time.Now()
	rl := rate.NewLimiter(rate.Every(time.Second*5), 1 /*burst*/)
	for {
		select {
		case <-ctx.Done():
			return nil
		case t := <-ticker.C:
			if s.lastHearbeatTime.Add(s.cfg.KeepAliveTTL).Before(time.Now()) {
				return errors.ErrHeartbeat.GenWithStack("timeout")
			}
			req := &pb.HeartbeatRequest{
				ExecutorId: string(s.selfID),
				Timestamp:  uint64(t.Unix()),
				// We report a longer TTL to the master, namely "ttl + rpc timeout", so that
				// the executor does not end up waiting for an RPC timeout when the TTL is
				// nearly up.
				Ttl: uint64(s.cfg.KeepAliveTTL.Milliseconds() + s.cfg.RPCTimeout.Milliseconds()),
			}
			_, err := s.masterClient.Heartbeat(ctx, req)
			if err != nil {
				if errors.Is(err, errors.ErrMasterNotReady) {
					s.lastHearbeatTime = t
					if rl.Allow() {
						log.L().Info("heartbeat success with MasterNotReady")
					}
					continue
				}

				log.Warn("heartbeat rpc meet error", zap.Error(err))
				if errors.Is(err, errors.ErrTombstoneExecutor) {
					return errors.ErrHeartbeat.GenWithStack("logic error: %v", err)
				}

				if s.lastHearbeatTime.Add(s.cfg.KeepAliveTTL).Before(time.Now()) {
					return errors.WrapError(errors.ErrHeartbeat, err, "timeout")
				}
				continue
			}

			// We aim to keep the executor's lastHbTime consistent with the master's lastHbTime.
			// If we set the executor's heartbeat time to the start time of the RPC, it will be
			// a little earlier than the master's heartbeat time, which is safe.
			// In contrast, if we set it to the end time of the RPC, it might be a little later
			// than the master's, which could cause the master to wait for less time than the
			// executor does. That gap is unsafe.
			s.lastHearbeatTime = t
			if rl.Allow() {
				log.L().Info("heartbeat success")
			}
		}
	}
}

func getJoinURLs(addrs string) []string {
	return strings.Split(addrs, ",")
}

// getInitServerMasterList returns a MasterServerList with
// all servers marked as followers.
func getInitServerMasterList(addrs string) pkgClient.MasterServerList {
	urls := getJoinURLs(addrs)
	ret := make(pkgClient.MasterServerList, len(urls))
	for _, addr := range urls {
		ret[addr] = false // Mark as not the leader.
	}
	return ret
}
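// As an illustration, with s.cfg.Join set to a hypothetical
// "master-1:10240,master-2:10240", getInitServerMasterList returns
//
//	pkgClient.MasterServerList{
//		"master-1:10240": false,
//		"master-2:10240": false,
//	}
//
// i.e. no address is assumed to be the leader until the failover client or
// bgUpdateServerMasterClients learns otherwise.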

func (s *Server) reportTaskRescOnce(ctx context.Context) error {
	// TODO: do we need to report allocated resource to master?
	// TODO: Implement task-wise workload reporting in TaskRunner.
	/*
		rescs := s.workerRtm.Workload()
		req := &pb.ExecWorkloadRequest{
			// TODO: use which field as ExecutorId is more accurate
			ExecutorId: s.cfg.WorkerAddr,
			Workloads:  make([]*pb.ExecWorkload, 0, len(rescs)),
		}
		for tp, resc := range rescs {
			req.Workloads = append(req.Workloads, &pb.ExecWorkload{
				Tp:    pb.JobType(tp),
				Usage: int32(resc),
			})
		}
		resp, err := s.masterClient.ReportExecutorWorkload(ctx, req)
		if err != nil {
			return err
		}
		if resp.Err != nil {
			log.Warn("report executor workload error", zap.String("err", resp.Err.String()))
		}
	*/
	return nil
}

// reportTaskResc periodically reports task resource usage to the resource manager.
func (s *Server) reportTaskResc(ctx context.Context) error {
	ticker := time.NewTicker(time.Second * 10)
	defer ticker.Stop()
	for {
		select {
		case <-ctx.Done():
			return nil
		case <-ticker.C:
			err := s.reportTaskRescOnce(ctx)
			if err != nil {
				return err
			}
		}
	}
}

func (s *Server) bgUpdateServerMasterClients(ctx context.Context) error {
	for {
		select {
		case <-ctx.Done():
			return errors.Trace(ctx.Err())
		case <-time.After(defaultDiscoveryAutoSyncInterval):
			masters, err := s.masterClient.ListMasters(ctx)
			if err != nil {
				log.Warn("update master list error", zap.Error(err))
				continue
			}
			masterList := make(pkgClient.MasterServerList)
			for _, m := range masters {
				masterList[m.Address] = m.IsLeader
			}
			if failoverCli, ok := s.masterClient.(*pkgClient.ServerMasterClientWithFailOver); ok {
				failoverCli.UpdateServerList(masterList)
			}
		}
	}
}

func (s *Server) collectMetricLoop(ctx context.Context, tickInterval time.Duration) error {
	metricRunningTask := executorTaskNumGauge.WithLabelValues("running")
	ticker := time.NewTicker(tickInterval)
	defer ticker.Stop()
	for {
		select {
		case <-ctx.Done():
			return nil
		case <-ticker.C:
			metricRunningTask.Set(float64(s.taskRunner.TaskCount()))
		}
	}
}