github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/engine/servermaster/jobmanager.go (about)

     1  // Copyright 2022 PingCAP, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // See the License for the specific language governing permissions and
    12  // limitations under the License.
    13  
    14  package servermaster
    15  
    16  import (
    17  	"context"
    18  	"encoding/json"
    19  	"regexp"
    20  	"sort"
    21  	"time"
    22  
    23  	"github.com/pingcap/log"
    24  	pb "github.com/pingcap/tiflow/engine/enginepb"
    25  	"github.com/pingcap/tiflow/engine/executor/cvs"
    26  	"github.com/pingcap/tiflow/engine/framework"
    27  	"github.com/pingcap/tiflow/engine/framework/metadata"
    28  	frameModel "github.com/pingcap/tiflow/engine/framework/model"
    29  	engineModel "github.com/pingcap/tiflow/engine/model"
    30  	"github.com/pingcap/tiflow/engine/pkg/clock"
    31  	dcontext "github.com/pingcap/tiflow/engine/pkg/context"
    32  	"github.com/pingcap/tiflow/engine/pkg/ctxmu"
    33  	resManager "github.com/pingcap/tiflow/engine/pkg/externalresource/manager"
    34  	engineHTTPUtil "github.com/pingcap/tiflow/engine/pkg/httputil"
    35  	"github.com/pingcap/tiflow/engine/pkg/notifier"
    36  	pkgOrm "github.com/pingcap/tiflow/engine/pkg/orm"
    37  	"github.com/pingcap/tiflow/engine/pkg/p2p"
    38  	"github.com/pingcap/tiflow/engine/pkg/tenant"
    39  	"github.com/pingcap/tiflow/engine/servermaster/jobop"
    40  	schedModel "github.com/pingcap/tiflow/engine/servermaster/scheduler/model"
    41  	"github.com/pingcap/tiflow/pkg/errors"
    42  	"github.com/pingcap/tiflow/pkg/httputil"
    43  	"github.com/pingcap/tiflow/pkg/label"
    44  	"github.com/pingcap/tiflow/pkg/notify"
    45  	"github.com/pingcap/tiflow/pkg/uuid"
    46  	"go.uber.org/zap"
    47  	"golang.org/x/sync/errgroup"
    48  	"google.golang.org/grpc/codes"
    49  	"google.golang.org/grpc/status"
    50  	"google.golang.org/protobuf/types/known/emptypb"
    51  )
    52  
    53  // JobManager defines manager of job master
    54  type JobManager interface {
    55  	framework.Master
    56  	JobStats
    57  	pb.JobManagerServer
    58  
    59  	GetJobMasterForwardAddress(ctx context.Context, jobID string) (string, error)
    60  	GetJobStatuses(ctx context.Context) (map[frameModel.MasterID]frameModel.MasterState, error)
    61  	UpdateJobStatus(ctx context.Context, jobID frameModel.MasterID, errMsg string, code frameModel.MasterState) error
    62  	WatchJobStatuses(
    63  		ctx context.Context,
    64  	) (resManager.JobStatusesSnapshot, *notifier.Receiver[resManager.JobStatusChangeEvent], error)
    65  }
    66  
    67  const (
    68  	defaultJobMasterCost = 1
    69  	jobOperateInterval   = time.Second * 15
    70  	defaultHTTPTimeout   = time.Second * 10
    71  	defaultListPageSize  = 100
    72  	maxListPageSize      = 1000
    73  )
    74  
    75  var jobIDRegex = regexp.MustCompile(`^\w([-.\w]{0,61}\w)?$`)
    76  
    77  // JobManagerImpl is a special job master that manages all the job masters, and notify the offline executor to them.
    78  // worker state transition
    79  // - submit new job, create job master successfully, then adds to the `waitAckJobs`.
    80  // - receive worker online, move job from `waitAckJobs` to `onlineJobs`.
    81  // - receive worker offline, move job from `onlineJobs` to `pendingJobs`.
    82  // - Tick checks `pendingJobs` periodically	and reschedules the jobs.
    83  type JobManagerImpl struct {
    84  	framework.BaseMaster
    85  	*JobFsm
    86  
    87  	masterMetaClient    *metadata.MasterMetadataClient
    88  	uuidGen             uuid.Generator
    89  	clocker             clock.Clock
    90  	frameMetaClient     pkgOrm.Client
    91  	tombstoneCleaned    bool
    92  	jobOperator         jobop.JobOperator
    93  	jobOperatorNotifier *notify.Notifier
    94  	JobBackoffMgr       jobop.BackoffManager
    95  
    96  	// jobStatusChangeMu must be taken when we try to create, delete,
    97  	// pause or resume a job.
    98  	// NOTE The concurrency management for the JobManager is not complete
    99  	// yet. We are prioritizing implementing all features.
   100  	// TODO We might add a pending operation queue in the future.
   101  	jobStatusChangeMu *ctxmu.CtxMutex
   102  	notifier          *notifier.Notifier[resManager.JobStatusChangeEvent]
   103  	wg                *errgroup.Group
   104  
   105  	// http client for the job detail
   106  	jobHTTPClient engineHTTPUtil.JobHTTPClient
   107  }
   108  
   109  // CancelJob implements JobManagerServer.CancelJob.
   110  func (jm *JobManagerImpl) CancelJob(ctx context.Context, req *pb.CancelJobRequest) (*pb.Job, error) {
   111  	meta, err := jm.frameMetaClient.GetJobByID(ctx, req.Id)
   112  	if err != nil {
   113  		if pkgOrm.IsNotFoundError(err) {
   114  			return nil, errors.ErrJobNotFound.GenWithStackByArgs(req.Id)
   115  		}
   116  		return nil, err
   117  	}
   118  
   119  	pbJob, err := buildPBJob(meta, false /* includeConfig */)
   120  	if err != nil {
   121  		return nil, err
   122  	}
   123  	if isJobTerminated(meta.State) {
   124  		return pbJob, nil
   125  	}
   126  
   127  	if err := jm.jobOperator.MarkJobCanceling(ctx, req.Id); err != nil {
   128  		return nil, err
   129  	}
   130  	jm.jobOperatorNotifier.Notify()
   131  	pbJob.State = pb.Job_Canceling
   132  	return pbJob, nil
   133  }
   134  
   135  // SendCancelJobMessage implements operateRouter.SendCancelJobMessage
   136  func (jm *JobManagerImpl) SendCancelJobMessage(ctx context.Context, jobID string) error {
   137  	job := jm.JobFsm.QueryOnlineJob(jobID)
   138  	if job == nil {
   139  		if _, err := jm.frameMetaClient.GetJobByID(ctx, jobID); pkgOrm.IsNotFoundError(err) {
   140  			return errors.ErrJobNotFound.GenWithStackByArgs(jobID)
   141  		}
   142  		return errors.ErrJobNotRunning.GenWithStackByArgs(jobID)
   143  	}
   144  
   145  	topic := frameModel.WorkerStatusChangeRequestTopic(jm.BaseMaster.MasterID(), job.WorkerHandle().ID())
   146  	msg := &frameModel.StatusChangeRequest{
   147  		SendTime:     jm.clocker.Mono(),
   148  		FromMasterID: jm.BaseMaster.MasterID(),
   149  		Epoch:        jm.BaseMaster.MasterMeta().Epoch,
   150  		ExpectState:  frameModel.WorkerStateStopped,
   151  	}
   152  	handle := job.WorkerHandle().Unwrap()
   153  	if handle == nil {
   154  		return errors.ErrJobNotRunning.GenWithStackByArgs(jobID)
   155  	}
   156  	return handle.SendMessage(ctx, topic, msg, true /*nonblocking*/)
   157  }
   158  
   159  // DeleteJob implements JobManagerServer.DeleteJob.
   160  func (jm *JobManagerImpl) DeleteJob(ctx context.Context, req *pb.DeleteJobRequest) (*emptypb.Empty, error) {
   161  	masterMeta, err := jm.frameMetaClient.GetJobByID(ctx, req.Id)
   162  	if err != nil {
   163  		if pkgOrm.IsNotFoundError(err) {
   164  			return nil, errors.ErrJobNotFound.GenWithStackByArgs(req.Id)
   165  		}
   166  		return nil, err
   167  	}
   168  
   169  	// Only terminated jobs can be deleted.
   170  	if !isJobTerminated(masterMeta.State) {
   171  		return nil, errors.ErrJobNotTerminated.GenWithStackByArgs(req.Id)
   172  	}
   173  
   174  	if err := jm.deleteJobMeta(ctx, req.Id); err != nil {
   175  		return nil, err
   176  	}
   177  	return &emptypb.Empty{}, nil
   178  }
   179  
   180  func isJobTerminated(state frameModel.MasterState) bool {
   181  	switch state {
   182  	case frameModel.MasterStateFinished, frameModel.MasterStateStopped, frameModel.MasterStateFailed:
   183  		return true
   184  	default:
   185  		return false
   186  	}
   187  }
   188  
   189  func (jm *JobManagerImpl) deleteJobMeta(ctx context.Context, jobID string) error {
   190  	ctx, cancel := context.WithTimeout(ctx, 5*time.Second)
   191  	defer cancel()
   192  
   193  	if ok := jm.jobStatusChangeMu.Lock(ctx); !ok {
   194  		return errors.Trace(ctx.Err())
   195  	}
   196  	defer jm.jobStatusChangeMu.Unlock()
   197  
   198  	// Note that DeleteJob is a soft delete.
   199  	res, err := jm.frameMetaClient.DeleteJob(ctx, jobID)
   200  	if err != nil {
   201  		return err
   202  	}
   203  	if res.RowsAffected() == 0 {
   204  		log.Warn("Job not found in meta (or already deleted)",
   205  			zap.Any("job-id", jobID))
   206  	}
   207  
   208  	jm.notifier.Notify(resManager.JobStatusChangeEvent{
   209  		EventType: resManager.JobRemovedEvent,
   210  		JobID:     jobID,
   211  	})
   212  	return nil
   213  }
   214  
   215  // GetJob implements JobManagerServer.GetJob.
   216  func (jm *JobManagerImpl) GetJob(ctx context.Context, req *pb.GetJobRequest) (*pb.Job, error) {
   217  	masterMeta, err := jm.frameMetaClient.GetJobByID(ctx, req.Id)
   218  	if err != nil {
   219  		if pkgOrm.IsNotFoundError(err) {
   220  			return nil, errors.ErrJobNotFound.GenWithStackByArgs(req.Id)
   221  		}
   222  		return nil, err
   223  	}
   224  
   225  	job, err := buildPBJob(masterMeta, req.IncludeConfig)
   226  	if err != nil {
   227  		return nil, err
   228  	}
   229  	jm.tryQueryJobDetail(ctx, masterMeta.Addr, job)
   230  
   231  	return job, nil
   232  }
   233  
   234  // CreateJob implements JobManagerServer.CreateJob.
   235  func (jm *JobManagerImpl) CreateJob(ctx context.Context, req *pb.CreateJobRequest) (*pb.Job, error) {
   236  	if err := validateCreateJobRequest(req); err != nil {
   237  		return nil, err
   238  	}
   239  
   240  	selectors, err := convertSelectors(req)
   241  	if err != nil {
   242  		return nil, err
   243  	}
   244  
   245  	// TODO call jm.notifier.Notify when we want to support "add job" event.
   246  	log.Info("create job", zap.Any("job", req.Job),
   247  		zap.String("tenant_id", req.TenantId), zap.String("project_id", req.ProjectId))
   248  
   249  	job := req.Job
   250  	if job.Id == "" {
   251  		job.Id = jm.uuidGen.NewString()
   252  	}
   253  
   254  	meta := &frameModel.MasterMeta{
   255  		ProjectID: tenant.NewProjectInfo(
   256  			req.TenantId,
   257  			req.ProjectId,
   258  		).UniqueID(),
   259  		ID:     job.Id,
   260  		Config: job.Config,
   261  		State:  frameModel.MasterStateUninit,
   262  		Ext: frameModel.MasterMetaExt{
   263  			Selectors: selectors,
   264  		},
   265  	}
   266  	switch job.Type {
   267  	case pb.Job_CVSDemo:
   268  		extConfig := &cvs.Config{}
   269  		if err := json.Unmarshal(job.Config, extConfig); err != nil {
   270  			return nil, status.Errorf(codes.InvalidArgument, "failed to decode config: %v", err)
   271  		}
   272  		meta.Type = frameModel.CvsJobMaster
   273  	case pb.Job_DM:
   274  		meta.Type = frameModel.DMJobMaster
   275  	case pb.Job_FakeJob:
   276  		meta.Type = frameModel.FakeJobMaster
   277  	default:
   278  		return nil, status.Errorf(codes.InvalidArgument, "job type %v is not supported", job.Type)
   279  	}
   280  
   281  	// create job master metadata before creating it.
   282  	if err := jm.frameMetaClient.InsertJob(ctx, meta); err != nil {
   283  		if pkgOrm.IsDuplicateEntryError(err) {
   284  			return nil, errors.ErrJobAlreadyExists.GenWithStackByArgs(job.Id)
   285  		}
   286  		return nil, err
   287  	}
   288  
   289  	// TODO: Refine me. split the BaseMaster
   290  	defaultMaster, ok := jm.BaseMaster.(interface {
   291  		SetProjectInfo(frameModel.MasterID, tenant.ProjectInfo)
   292  	})
   293  	if ok {
   294  		defaultMaster.SetProjectInfo(meta.ID, tenant.NewProjectInfo(req.TenantId, req.ProjectId))
   295  	} else {
   296  		log.Error("jobmanager don't have the 'SetProjectInfo' interface",
   297  			zap.String("masterID", meta.ID),
   298  			zap.Any("projectInfo", tenant.NewProjectInfo(req.TenantId, req.ProjectId)))
   299  	}
   300  
   301  	// CreateWorker here is to create job master actually
   302  	// TODO: use correct worker cost
   303  	workerID, err := jm.frameworkCreateWorker(meta)
   304  	if err != nil {
   305  		err2 := metadata.DeleteMasterMeta(ctx, jm.frameMetaClient, meta.ID)
   306  		if err2 != nil {
   307  			// TODO: add more GC mechanism if master meta is failed to delete
   308  			log.Error("failed to delete master meta", zap.Error(err2))
   309  		}
   310  
   311  		log.Error("create job master met error", zap.Error(err))
   312  		return nil, err
   313  	}
   314  
   315  	if workerID != job.Id {
   316  		log.Panic("job id is not equal to worker id of job master", zap.String("job-id", job.Id), zap.String("worker-id", workerID))
   317  	}
   318  	jm.JobFsm.JobDispatched(meta, false /*addFromFailover*/)
   319  
   320  	return buildPBJob(meta, false /* includeConfig */)
   321  }
   322  
   323  func validateCreateJobRequest(req *pb.CreateJobRequest) error {
   324  	if req.Job == nil {
   325  		return status.Error(codes.InvalidArgument, "job must not be nil")
   326  	}
   327  	if req.Job.Id != "" && !jobIDRegex.MatchString(req.Job.Id) {
   328  		return status.Errorf(codes.InvalidArgument, "job id must match %s", jobIDRegex.String())
   329  	}
   330  	if req.Job.Type == pb.Job_TypeUnknown {
   331  		return status.Error(codes.InvalidArgument, "job type must be specified")
   332  	}
   333  	return nil
   334  }
   335  
   336  func convertSelectors(req *pb.CreateJobRequest) ([]*label.Selector, error) {
   337  	if len(req.GetJob().Selectors) == 0 {
   338  		return nil, nil
   339  	}
   340  
   341  	ret := make([]*label.Selector, 0, len(req.GetJob().Selectors))
   342  	for _, pbSel := range req.Job.Selectors {
   343  		sel, err := schedModel.SelectorFromPB(pbSel)
   344  		if err != nil {
   345  			return nil, err
   346  		}
   347  		if err := sel.Validate(); err != nil {
   348  			return nil, err
   349  		}
   350  		ret = append(ret, sel)
   351  	}
   352  	return ret, nil
   353  }
   354  
   355  // ListJobs implements JobManagerServer.ListJobs.
   356  func (jm *JobManagerImpl) ListJobs(ctx context.Context, req *pb.ListJobsRequest) (*pb.ListJobsResponse, error) {
   357  	masterMetas, err := jm.frameMetaClient.QueryJobs(ctx)
   358  	if err != nil {
   359  		return nil, err
   360  	}
   361  
   362  	sort.Slice(masterMetas, func(i, j int) bool {
   363  		return masterMetas[i].ID < masterMetas[j].ID
   364  	})
   365  
   366  	firstIdx := sort.Search(len(masterMetas), func(i int) bool {
   367  		return masterMetas[i].ID > req.PageToken
   368  	})
   369  
   370  	pageSize := req.PageSize
   371  	if pageSize <= 0 {
   372  		pageSize = defaultListPageSize
   373  	} else if pageSize > maxListPageSize {
   374  		pageSize = maxListPageSize
   375  	}
   376  
   377  	resp := &pb.ListJobsResponse{}
   378  	for i := firstIdx; i < len(masterMetas); i++ {
   379  		if masterMetas[i].Type == frameModel.JobManager {
   380  			continue
   381  		}
   382  
   383  		job, err := buildPBJob(masterMetas[i], req.IncludeConfig)
   384  		if err != nil {
   385  			return nil, err
   386  		}
   387  		if req.Type != pb.Job_TypeUnknown && job.Type != req.Type {
   388  			continue
   389  		}
   390  		if req.State != pb.Job_StateUnknown && job.State != req.State {
   391  			continue
   392  		}
   393  		jm.tryQueryJobDetail(ctx, masterMetas[i].Addr, job)
   394  
   395  		resp.Jobs = append(resp.Jobs, job)
   396  		// Retrieve one more job to determine whether there is a next page.
   397  		if int32(len(resp.Jobs)) >= pageSize+1 {
   398  			break
   399  		}
   400  	}
   401  
   402  	if len(resp.Jobs) > int(pageSize) {
   403  		resp.Jobs = resp.Jobs[:pageSize]
   404  		resp.NextPageToken = resp.Jobs[pageSize-1].Id
   405  	}
   406  	return resp, nil
   407  }
   408  
   409  func (jm *JobManagerImpl) tryQueryJobDetail(ctx context.Context, jobMasterAddr string, job *pb.Job) {
   410  	// If job is not running, we can't query job detail from jobmaster.
   411  	if job.State != pb.Job_Running || jm.JobFsm.QueryOnlineJob(job.Id) == nil {
   412  		return
   413  	}
   414  	detail, httpErr := jm.jobHTTPClient.GetJobDetail(ctx, jobMasterAddr, job.Id)
   415  	if httpErr != nil {
   416  		job.Error = &pb.Job_Error{
   417  			Code:    httpErr.Code,
   418  			Message: httpErr.Message,
   419  		}
   420  	} else {
   421  		job.Detail = detail
   422  	}
   423  }
   424  
   425  func buildPBJob(masterMeta *frameModel.MasterMeta, includeConfig bool) (*pb.Job, error) {
   426  	var jobType pb.Job_Type
   427  	switch tp := framework.MustConvertWorkerType2JobType(masterMeta.Type); tp {
   428  	case engineModel.JobTypeCVSDemo:
   429  		jobType = pb.Job_CVSDemo
   430  	case engineModel.JobTypeDM:
   431  		jobType = pb.Job_DM
   432  	case engineModel.JobTypeCDC:
   433  		jobType = pb.Job_CDC
   434  	case engineModel.JobTypeFakeJob:
   435  		jobType = pb.Job_FakeJob
   436  	default:
   437  		return nil, errors.Errorf("job %s has unknown type %v", masterMeta.ID, masterMeta.Type)
   438  	}
   439  
   440  	var jobState pb.Job_State
   441  	switch masterMeta.State {
   442  	case frameModel.MasterStateUninit:
   443  		jobState = pb.Job_Created
   444  	case frameModel.MasterStateInit:
   445  		jobState = pb.Job_Running
   446  	case frameModel.MasterStateFinished:
   447  		jobState = pb.Job_Finished
   448  	case frameModel.MasterStateStopped:
   449  		jobState = pb.Job_Canceled
   450  	case frameModel.MasterStateFailed:
   451  		jobState = pb.Job_Failed
   452  	default:
   453  		return nil, errors.Errorf("job %s has unknown state %v", masterMeta.ID, masterMeta.State)
   454  	}
   455  
   456  	var selectors []*pb.Selector
   457  	for _, sel := range masterMeta.Ext.Selectors {
   458  		pbSel, err := schedModel.SelectorToPB(sel)
   459  		if err != nil {
   460  			return nil, errors.Annotate(err, "buildPBJob")
   461  		}
   462  		selectors = append(selectors, pbSel)
   463  	}
   464  	job := &pb.Job{
   465  		Id:     masterMeta.ID,
   466  		Type:   jobType,
   467  		State:  jobState,
   468  		Detail: masterMeta.Detail,
   469  		Error: &pb.Job_Error{
   470  			Message: masterMeta.ErrorMsg,
   471  		},
   472  		Selectors: selectors,
   473  	}
   474  	if includeConfig {
   475  		job.Config = masterMeta.Config
   476  	}
   477  	return job, nil
   478  }
   479  
   480  // GetJobMasterForwardAddress implements JobManager.GetJobMasterForwardAddress.
   481  func (jm *JobManagerImpl) GetJobMasterForwardAddress(ctx context.Context, jobID string) (string, error) {
   482  	// Always query from database. Master meta in JobFsm may be out of date.
   483  	masterMeta, err := jm.frameMetaClient.GetJobByID(ctx, jobID)
   484  	if err != nil {
   485  		if pkgOrm.IsNotFoundError(err) {
   486  			return "", errors.ErrJobNotFound.GenWithStackByArgs(jobID)
   487  		}
   488  		return "", err
   489  	}
   490  	if masterMeta.State != frameModel.MasterStateInit || jm.JobFsm.QueryOnlineJob(jobID) == nil {
   491  		return "", errors.ErrJobNotRunning.GenWithStackByArgs(jobID)
   492  	}
   493  	return masterMeta.Addr, nil
   494  }
   495  
   496  // GetJobStatuses returns the status code of all jobs that are not deleted.
   497  func (jm *JobManagerImpl) GetJobStatuses(
   498  	ctx context.Context,
   499  ) (map[frameModel.MasterID]frameModel.MasterState, error) {
   500  	// BUG? NO filter in the implement
   501  	jobs, err := jm.frameMetaClient.QueryJobs(ctx)
   502  	if err != nil {
   503  		return nil, err
   504  	}
   505  
   506  	ret := make(map[frameModel.MasterID]frameModel.MasterState, len(jobs))
   507  	for _, jobMeta := range jobs {
   508  		ret[jobMeta.ID] = jobMeta.State
   509  	}
   510  	return ret, nil
   511  }
   512  
   513  // UpdateJobStatus implements JobManager.UpdateJobStatus
   514  func (jm *JobManagerImpl) UpdateJobStatus(
   515  	ctx context.Context, jobID frameModel.MasterID, errMsg string, code frameModel.MasterState,
   516  ) error {
   517  	// Note since the job is not online, it is safe to get from metastore and then update
   518  	meta, err := jm.frameMetaClient.GetJobByID(ctx, jobID)
   519  	if err != nil {
   520  		return err
   521  	}
   522  	meta.ErrorMsg = errMsg
   523  	meta.State = code
   524  	return jm.frameMetaClient.UpsertJob(ctx, meta)
   525  }
   526  
   527  // NewJobManagerImpl creates a new JobManagerImpl instance
   528  func NewJobManagerImpl(
   529  	dctx *dcontext.Context,
   530  	id frameModel.MasterID,
   531  	backoffConfig *jobop.BackoffConfig,
   532  ) (*JobManagerImpl, error) {
   533  	metaCli, err := dctx.Deps().Construct(func(cli pkgOrm.Client) (pkgOrm.Client, error) {
   534  		return cli, nil
   535  	})
   536  	if err != nil {
   537  		return nil, err
   538  	}
   539  
   540  	metaClient := metaCli.(pkgOrm.Client)
   541  	cli := metadata.NewMasterMetadataClient(id, metaClient)
   542  
   543  	httpCli, err := httputil.NewClient(nil)
   544  	if err != nil {
   545  		return nil, err
   546  	}
   547  	httpCli.SetTimeout(defaultHTTPTimeout)
   548  
   549  	clocker := clock.New()
   550  	impl := &JobManagerImpl{
   551  		JobFsm:              NewJobFsm(),
   552  		uuidGen:             uuid.NewGenerator(),
   553  		masterMetaClient:    cli,
   554  		clocker:             clocker,
   555  		frameMetaClient:     metaClient,
   556  		jobStatusChangeMu:   ctxmu.New(),
   557  		notifier:            notifier.NewNotifier[resManager.JobStatusChangeEvent](),
   558  		jobOperatorNotifier: new(notify.Notifier),
   559  		jobHTTPClient:       engineHTTPUtil.NewJobHTTPClient(httpCli),
   560  		JobBackoffMgr:       jobop.NewBackoffManagerImpl(clocker, backoffConfig),
   561  	}
   562  	impl.BaseMaster = framework.NewBaseMaster(
   563  		dctx,
   564  		impl,
   565  		id,
   566  		frameModel.JobManager,
   567  	)
   568  	impl.jobOperator = jobop.NewJobOperatorImpl(metaClient, impl)
   569  	wg, ctx := errgroup.WithContext(dctx)
   570  	impl.wg = wg
   571  
   572  	// Note the meta data of job manager is not used, it is safe to overwrite it
   573  	// every time a new server master leader is elected. And we always mark the
   574  	// Initialized to true in order to trigger OnMasterRecovered of job manager.
   575  	meta := impl.MasterMeta()
   576  	meta.State = frameModel.MasterStateInit
   577  	err = metadata.StoreMasterMeta(ctx, impl.frameMetaClient, meta)
   578  	if err != nil {
   579  		return nil, err
   580  	}
   581  	err = impl.BaseMaster.Init(ctx)
   582  	if err != nil {
   583  		_ = impl.BaseMaster.Close(ctx)
   584  		return nil, err
   585  	}
   586  	impl.bgJobOperatorLoop(ctx)
   587  
   588  	return impl, err
   589  }
   590  
   591  // InitImpl implements frame.MasterImpl.InitImpl
   592  func (jm *JobManagerImpl) InitImpl(ctx context.Context) error {
   593  	return nil
   594  }
   595  
   596  // Tick implements frame.MasterImpl.Tick
   597  func (jm *JobManagerImpl) Tick(ctx context.Context) error {
   598  	filterQuotaError := func(err error) (exceedQuota bool, retErr error) {
   599  		if err == nil {
   600  			return false, nil
   601  		}
   602  		if errors.Is(err, errors.ErrMasterConcurrencyExceeded) {
   603  			log.Warn("create worker exceeds quota, retry later", zap.Error(err))
   604  			return true, nil
   605  		}
   606  		return false, err
   607  	}
   608  
   609  	err := jm.JobFsm.IterPendingJobs(
   610  		func(job *frameModel.MasterMeta) (string, error) {
   611  			isJobCanceling := jm.jobOperator.IsJobCanceling(ctx, job.ID)
   612  			if isJobCanceling || jm.JobBackoffMgr.Terminate(job.ID) {
   613  				state := frameModel.MasterStateFailed
   614  				if isJobCanceling {
   615  					state = frameModel.MasterStateStopped
   616  				}
   617  				if err := jm.terminateJob(ctx, job.ErrorMsg, job.ID, state); err != nil {
   618  					return "", err
   619  				}
   620  				return "", errors.ErrMasterCreateWorkerTerminate.FastGenByArgs()
   621  			}
   622  			if !jm.JobBackoffMgr.Allow(job.ID) {
   623  				return "", errors.ErrMasterCreateWorkerBackoff.FastGenByArgs()
   624  			}
   625  			return jm.frameworkCreateWorker(job)
   626  		})
   627  	if _, err = filterQuotaError(err); err != nil {
   628  		return err
   629  	}
   630  
   631  	if !jm.tombstoneCleaned && jm.BaseMaster.IsMasterReady() {
   632  		for _, worker := range jm.BaseMaster.GetWorkers() {
   633  			// clean tombstone workers from worker manager and they will be
   634  			// re-created in the following IterWaitAckJobs
   635  			tombstoneHandle := worker.GetTombstone()
   636  			if tombstoneHandle != nil {
   637  				if err := tombstoneHandle.CleanTombstone(ctx); err != nil {
   638  					return err
   639  				}
   640  				continue
   641  			}
   642  			// mark non-tombstone workers as online
   643  			err := jm.JobFsm.JobOnline(worker)
   644  			// ignore worker that is not in WaitAck list
   645  			if err != nil && !errors.Is(err, errors.ErrWorkerNotFound) {
   646  				return err
   647  			}
   648  		}
   649  		err = jm.JobFsm.IterWaitAckJobs(
   650  			func(job *frameModel.MasterMeta) (string, error) {
   651  				return jm.frameworkCreateWorker(job)
   652  			})
   653  		exceedQuota, err := filterQuotaError(err)
   654  		if err != nil {
   655  			return err
   656  		}
   657  		// if met exceed quota error, the remaining jobs need to be failover in
   658  		// another tick
   659  		if !exceedQuota {
   660  			jm.tombstoneCleaned = true
   661  		}
   662  	}
   663  
   664  	return nil
   665  }
   666  
   667  // OnMasterRecovered implements frame.MasterImpl.OnMasterRecovered
   668  func (jm *JobManagerImpl) OnMasterRecovered(ctx context.Context) error {
   669  	jobs, err := jm.masterMetaClient.LoadAllMasters(ctx)
   670  	if err != nil {
   671  		return err
   672  	}
   673  
   674  	// TODO: refine me, split the BaseMaster interface
   675  	impl, ok := jm.BaseMaster.(interface {
   676  		InitProjectInfosAfterRecover([]*frameModel.MasterMeta)
   677  	})
   678  	if !ok {
   679  		log.Panic("unfound interface for BaseMaster", zap.String("interface", "InitProjectInfosAfterRecover"))
   680  		return errors.ErrMasterInterfaceNotFound.GenWithStackByArgs()
   681  	}
   682  	impl.InitProjectInfosAfterRecover(jobs)
   683  
   684  	for _, job := range jobs {
   685  		if job.Type == frameModel.JobManager {
   686  			continue
   687  		}
   688  		// TODO: filter the job in backend
   689  		if job.State.IsTerminatedState() {
   690  			log.Info("skip job in terminated status", zap.Any("job", job))
   691  			continue
   692  		}
   693  		jm.JobFsm.JobDispatched(job, true /*addFromFailover*/)
   694  		log.Info("recover job, move it to WaitAck job queue", zap.Any("job", job))
   695  	}
   696  	return nil
   697  }
   698  
   699  // OnWorkerDispatched implements frame.MasterImpl.OnWorkerDispatched
   700  func (jm *JobManagerImpl) OnWorkerDispatched(worker framework.WorkerHandle, result error) error {
   701  	if result != nil {
   702  		if errors.Is(result, errors.ErrCreateWorkerTerminate) {
   703  			errMsg := result.Error()
   704  			if cause := errors.Cause(result); cause != nil {
   705  				errMsg = cause.Error()
   706  			}
   707  			if err := jm.terminateJob(
   708  				context.Background(), errMsg, worker.ID(), frameModel.MasterStateFailed,
   709  			); err != nil {
   710  				return err
   711  			}
   712  			jm.JobFsm.JobOffline(worker, false /* needFailover */)
   713  			return nil
   714  		}
   715  		log.Warn("dispatch worker met error", zap.Error(result))
   716  		jm.JobBackoffMgr.JobFail(worker.ID())
   717  		return jm.JobFsm.JobDispatchFailed(worker)
   718  	}
   719  	return nil
   720  }
   721  
   722  // OnWorkerOnline implements frame.MasterImpl.OnWorkerOnline
   723  func (jm *JobManagerImpl) OnWorkerOnline(worker framework.WorkerHandle) error {
   724  	log.Info("on worker online", zap.Any("id", worker.ID()))
   725  	jm.JobBackoffMgr.JobOnline(worker.ID())
   726  	return jm.JobFsm.JobOnline(worker)
   727  }
   728  
   729  // OnWorkerOffline implements frame.MasterImpl.OnWorkerOffline
   730  func (jm *JobManagerImpl) OnWorkerOffline(worker framework.WorkerHandle, reason error) error {
   731  	needFailover := true
   732  	if errors.Is(reason, errors.ErrWorkerFinish) {
   733  		log.Info("job master finished", zap.String("id", worker.ID()))
   734  		needFailover = false
   735  	} else if errors.Is(reason, errors.ErrWorkerCancel) {
   736  		log.Info("job master canceled", zap.String("id", worker.ID()))
   737  		needFailover = false
   738  		jm.jobOperatorNotifier.Notify()
   739  	} else if errors.Is(reason, errors.ErrWorkerFailed) {
   740  		log.Info("job master failed permanently", zap.String("id", worker.ID()))
   741  		needFailover = false
   742  	} else {
   743  		log.Info("on worker offline", zap.Any("id", worker.ID()), zap.Any("reason", reason))
   744  	}
   745  	ctx, cancel := context.WithTimeout(context.Background(), time.Second*5)
   746  	defer cancel()
   747  	if err := worker.GetTombstone().CleanTombstone(ctx); err != nil {
   748  		return err
   749  	}
   750  	if needFailover {
   751  		jm.JobBackoffMgr.JobFail(worker.ID())
   752  	} else {
   753  		jm.JobBackoffMgr.JobTerminate(worker.ID())
   754  	}
   755  	jm.JobFsm.JobOffline(worker, needFailover)
   756  	return nil
   757  }
   758  
   759  // OnWorkerMessage implements frame.MasterImpl.OnWorkerMessage
   760  func (jm *JobManagerImpl) OnWorkerMessage(worker framework.WorkerHandle, topic p2p.Topic, message interface{}) error {
   761  	log.Info("on worker message", zap.Any("id", worker.ID()), zap.Any("topic", topic), zap.Any("message", message))
   762  	return nil
   763  }
   764  
   765  // OnWorkerStatusUpdated implements frame.MasterImpl.OnWorkerStatusUpdated
   766  func (jm *JobManagerImpl) OnWorkerStatusUpdated(worker framework.WorkerHandle, newStatus *frameModel.WorkerStatus) error {
   767  	log.Info("on worker status updated", zap.String("worker-id", worker.ID()), zap.Any("status", newStatus))
   768  	return nil
   769  }
   770  
   771  // CloseImpl implements frame.MasterImpl.CloseImpl
   772  func (jm *JobManagerImpl) CloseImpl(ctx context.Context) {
   773  	jm.notifier.Close()
   774  	jm.jobHTTPClient.Close()
   775  	jm.jobOperatorNotifier.Close()
   776  }
   777  
   778  // StopImpl implements frame.MasterImpl.StopImpl
   779  func (jm *JobManagerImpl) StopImpl(ctx context.Context) {
   780  	jm.CloseImpl(ctx)
   781  }
   782  
   783  // WatchJobStatuses returns a snapshot of job statuses followed by a stream
   784  // of job status changes.
   785  func (jm *JobManagerImpl) WatchJobStatuses(
   786  	ctx context.Context,
   787  ) (resManager.JobStatusesSnapshot, *notifier.Receiver[resManager.JobStatusChangeEvent], error) {
   788  	// We add an explicit deadline to make sure that
   789  	// any potential problem will not block the JobManager forever.
   790  	ctx, cancel := context.WithTimeout(ctx, 10*time.Second)
   791  	defer cancel()
   792  
   793  	// Note that the lock is cancellable by the context.
   794  	if ok := jm.jobStatusChangeMu.Lock(ctx); !ok {
   795  		return nil, nil, errors.Trace(ctx.Err())
   796  	}
   797  	defer jm.jobStatusChangeMu.Unlock()
   798  
   799  	snapshot, err := jm.GetJobStatuses(ctx)
   800  	if err != nil {
   801  		return nil, nil, err
   802  	}
   803  
   804  	// Waits for pending JobStatusChangeEvents to be flushed,
   805  	// so that the new receiver does not receive any stale data.
   806  	err = jm.notifier.Flush(ctx)
   807  	if err != nil {
   808  		return nil, nil, errors.Trace(err)
   809  	}
   810  
   811  	receiver := jm.notifier.NewReceiver()
   812  	return snapshot, receiver, nil
   813  }
   814  
   815  func (jm *JobManagerImpl) bgJobOperatorLoop(ctx context.Context) {
   816  	jm.wg.Go(func() error {
   817  		defer func() {
   818  			log.Info("job manager job operator loop exited")
   819  		}()
   820  		receiver, err := jm.jobOperatorNotifier.NewReceiver(jobOperateInterval)
   821  		if err != nil {
   822  			return err
   823  		}
   824  		defer receiver.Stop()
   825  		for {
   826  			select {
   827  			case <-ctx.Done():
   828  				return errors.Trace(ctx.Err())
   829  			case _, ok := <-receiver.C:
   830  				if !ok {
   831  					return nil
   832  				}
   833  			}
   834  			if err := jm.jobOperator.Tick(ctx); err != nil {
   835  				// error returns from Tick is only caused by metastore error, so
   836  				// only log it and retry later.
   837  				log.Warn("job operator tick with error", zap.Error(err))
   838  			}
   839  		}
   840  	})
   841  }
   842  
   843  func (jm *JobManagerImpl) frameworkCreateWorker(job *frameModel.MasterMeta) (string, error) {
   844  	return jm.BaseMaster.CreateWorker(job.Type, job,
   845  		framework.CreateWorkerWithSelectors(job.Ext.Selectors...))
   846  }
   847  
   848  func (jm *JobManagerImpl) terminateJob(
   849  	ctx context.Context, errMsg string, jobID string, state frameModel.MasterState,
   850  ) error {
   851  	log.Info("job master terminated", zap.String("job-id", jobID),
   852  		zap.String("error", errMsg), zap.Any("state", state))
   853  	ctx, cancel := context.WithTimeout(ctx, time.Second*5)
   854  	defer cancel()
   855  	return jm.UpdateJobStatus(ctx, jobID, errMsg, state)
   856  }