github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/engine/servermaster/jobmanager.go (about) 1 // Copyright 2022 PingCAP, Inc. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // See the License for the specific language governing permissions and 12 // limitations under the License. 13 14 package servermaster 15 16 import ( 17 "context" 18 "encoding/json" 19 "regexp" 20 "sort" 21 "time" 22 23 "github.com/pingcap/log" 24 pb "github.com/pingcap/tiflow/engine/enginepb" 25 "github.com/pingcap/tiflow/engine/executor/cvs" 26 "github.com/pingcap/tiflow/engine/framework" 27 "github.com/pingcap/tiflow/engine/framework/metadata" 28 frameModel "github.com/pingcap/tiflow/engine/framework/model" 29 engineModel "github.com/pingcap/tiflow/engine/model" 30 "github.com/pingcap/tiflow/engine/pkg/clock" 31 dcontext "github.com/pingcap/tiflow/engine/pkg/context" 32 "github.com/pingcap/tiflow/engine/pkg/ctxmu" 33 resManager "github.com/pingcap/tiflow/engine/pkg/externalresource/manager" 34 engineHTTPUtil "github.com/pingcap/tiflow/engine/pkg/httputil" 35 "github.com/pingcap/tiflow/engine/pkg/notifier" 36 pkgOrm "github.com/pingcap/tiflow/engine/pkg/orm" 37 "github.com/pingcap/tiflow/engine/pkg/p2p" 38 "github.com/pingcap/tiflow/engine/pkg/tenant" 39 "github.com/pingcap/tiflow/engine/servermaster/jobop" 40 schedModel "github.com/pingcap/tiflow/engine/servermaster/scheduler/model" 41 "github.com/pingcap/tiflow/pkg/errors" 42 "github.com/pingcap/tiflow/pkg/httputil" 43 "github.com/pingcap/tiflow/pkg/label" 44 "github.com/pingcap/tiflow/pkg/notify" 45 "github.com/pingcap/tiflow/pkg/uuid" 46 "go.uber.org/zap" 47 "golang.org/x/sync/errgroup" 48 "google.golang.org/grpc/codes" 49 "google.golang.org/grpc/status" 50 "google.golang.org/protobuf/types/known/emptypb" 51 ) 52 53 // JobManager defines manager of job master 54 type JobManager interface { 55 framework.Master 56 JobStats 57 pb.JobManagerServer 58 59 GetJobMasterForwardAddress(ctx context.Context, jobID string) (string, error) 60 GetJobStatuses(ctx context.Context) (map[frameModel.MasterID]frameModel.MasterState, error) 61 UpdateJobStatus(ctx context.Context, jobID frameModel.MasterID, errMsg string, code frameModel.MasterState) error 62 WatchJobStatuses( 63 ctx context.Context, 64 ) (resManager.JobStatusesSnapshot, *notifier.Receiver[resManager.JobStatusChangeEvent], error) 65 } 66 67 const ( 68 defaultJobMasterCost = 1 69 jobOperateInterval = time.Second * 15 70 defaultHTTPTimeout = time.Second * 10 71 defaultListPageSize = 100 72 maxListPageSize = 1000 73 ) 74 75 var jobIDRegex = regexp.MustCompile(`^\w([-.\w]{0,61}\w)?$`) 76 77 // JobManagerImpl is a special job master that manages all the job masters, and notify the offline executor to them. 78 // worker state transition 79 // - submit new job, create job master successfully, then adds to the `waitAckJobs`. 80 // - receive worker online, move job from `waitAckJobs` to `onlineJobs`. 81 // - receive worker offline, move job from `onlineJobs` to `pendingJobs`. 82 // - Tick checks `pendingJobs` periodically and reschedules the jobs. 83 type JobManagerImpl struct { 84 framework.BaseMaster 85 *JobFsm 86 87 masterMetaClient *metadata.MasterMetadataClient 88 uuidGen uuid.Generator 89 clocker clock.Clock 90 frameMetaClient pkgOrm.Client 91 tombstoneCleaned bool 92 jobOperator jobop.JobOperator 93 jobOperatorNotifier *notify.Notifier 94 JobBackoffMgr jobop.BackoffManager 95 96 // jobStatusChangeMu must be taken when we try to create, delete, 97 // pause or resume a job. 98 // NOTE The concurrency management for the JobManager is not complete 99 // yet. We are prioritizing implementing all features. 100 // TODO We might add a pending operation queue in the future. 101 jobStatusChangeMu *ctxmu.CtxMutex 102 notifier *notifier.Notifier[resManager.JobStatusChangeEvent] 103 wg *errgroup.Group 104 105 // http client for the job detail 106 jobHTTPClient engineHTTPUtil.JobHTTPClient 107 } 108 109 // CancelJob implements JobManagerServer.CancelJob. 110 func (jm *JobManagerImpl) CancelJob(ctx context.Context, req *pb.CancelJobRequest) (*pb.Job, error) { 111 meta, err := jm.frameMetaClient.GetJobByID(ctx, req.Id) 112 if err != nil { 113 if pkgOrm.IsNotFoundError(err) { 114 return nil, errors.ErrJobNotFound.GenWithStackByArgs(req.Id) 115 } 116 return nil, err 117 } 118 119 pbJob, err := buildPBJob(meta, false /* includeConfig */) 120 if err != nil { 121 return nil, err 122 } 123 if isJobTerminated(meta.State) { 124 return pbJob, nil 125 } 126 127 if err := jm.jobOperator.MarkJobCanceling(ctx, req.Id); err != nil { 128 return nil, err 129 } 130 jm.jobOperatorNotifier.Notify() 131 pbJob.State = pb.Job_Canceling 132 return pbJob, nil 133 } 134 135 // SendCancelJobMessage implements operateRouter.SendCancelJobMessage 136 func (jm *JobManagerImpl) SendCancelJobMessage(ctx context.Context, jobID string) error { 137 job := jm.JobFsm.QueryOnlineJob(jobID) 138 if job == nil { 139 if _, err := jm.frameMetaClient.GetJobByID(ctx, jobID); pkgOrm.IsNotFoundError(err) { 140 return errors.ErrJobNotFound.GenWithStackByArgs(jobID) 141 } 142 return errors.ErrJobNotRunning.GenWithStackByArgs(jobID) 143 } 144 145 topic := frameModel.WorkerStatusChangeRequestTopic(jm.BaseMaster.MasterID(), job.WorkerHandle().ID()) 146 msg := &frameModel.StatusChangeRequest{ 147 SendTime: jm.clocker.Mono(), 148 FromMasterID: jm.BaseMaster.MasterID(), 149 Epoch: jm.BaseMaster.MasterMeta().Epoch, 150 ExpectState: frameModel.WorkerStateStopped, 151 } 152 handle := job.WorkerHandle().Unwrap() 153 if handle == nil { 154 return errors.ErrJobNotRunning.GenWithStackByArgs(jobID) 155 } 156 return handle.SendMessage(ctx, topic, msg, true /*nonblocking*/) 157 } 158 159 // DeleteJob implements JobManagerServer.DeleteJob. 160 func (jm *JobManagerImpl) DeleteJob(ctx context.Context, req *pb.DeleteJobRequest) (*emptypb.Empty, error) { 161 masterMeta, err := jm.frameMetaClient.GetJobByID(ctx, req.Id) 162 if err != nil { 163 if pkgOrm.IsNotFoundError(err) { 164 return nil, errors.ErrJobNotFound.GenWithStackByArgs(req.Id) 165 } 166 return nil, err 167 } 168 169 // Only terminated jobs can be deleted. 170 if !isJobTerminated(masterMeta.State) { 171 return nil, errors.ErrJobNotTerminated.GenWithStackByArgs(req.Id) 172 } 173 174 if err := jm.deleteJobMeta(ctx, req.Id); err != nil { 175 return nil, err 176 } 177 return &emptypb.Empty{}, nil 178 } 179 180 func isJobTerminated(state frameModel.MasterState) bool { 181 switch state { 182 case frameModel.MasterStateFinished, frameModel.MasterStateStopped, frameModel.MasterStateFailed: 183 return true 184 default: 185 return false 186 } 187 } 188 189 func (jm *JobManagerImpl) deleteJobMeta(ctx context.Context, jobID string) error { 190 ctx, cancel := context.WithTimeout(ctx, 5*time.Second) 191 defer cancel() 192 193 if ok := jm.jobStatusChangeMu.Lock(ctx); !ok { 194 return errors.Trace(ctx.Err()) 195 } 196 defer jm.jobStatusChangeMu.Unlock() 197 198 // Note that DeleteJob is a soft delete. 199 res, err := jm.frameMetaClient.DeleteJob(ctx, jobID) 200 if err != nil { 201 return err 202 } 203 if res.RowsAffected() == 0 { 204 log.Warn("Job not found in meta (or already deleted)", 205 zap.Any("job-id", jobID)) 206 } 207 208 jm.notifier.Notify(resManager.JobStatusChangeEvent{ 209 EventType: resManager.JobRemovedEvent, 210 JobID: jobID, 211 }) 212 return nil 213 } 214 215 // GetJob implements JobManagerServer.GetJob. 216 func (jm *JobManagerImpl) GetJob(ctx context.Context, req *pb.GetJobRequest) (*pb.Job, error) { 217 masterMeta, err := jm.frameMetaClient.GetJobByID(ctx, req.Id) 218 if err != nil { 219 if pkgOrm.IsNotFoundError(err) { 220 return nil, errors.ErrJobNotFound.GenWithStackByArgs(req.Id) 221 } 222 return nil, err 223 } 224 225 job, err := buildPBJob(masterMeta, req.IncludeConfig) 226 if err != nil { 227 return nil, err 228 } 229 jm.tryQueryJobDetail(ctx, masterMeta.Addr, job) 230 231 return job, nil 232 } 233 234 // CreateJob implements JobManagerServer.CreateJob. 235 func (jm *JobManagerImpl) CreateJob(ctx context.Context, req *pb.CreateJobRequest) (*pb.Job, error) { 236 if err := validateCreateJobRequest(req); err != nil { 237 return nil, err 238 } 239 240 selectors, err := convertSelectors(req) 241 if err != nil { 242 return nil, err 243 } 244 245 // TODO call jm.notifier.Notify when we want to support "add job" event. 246 log.Info("create job", zap.Any("job", req.Job), 247 zap.String("tenant_id", req.TenantId), zap.String("project_id", req.ProjectId)) 248 249 job := req.Job 250 if job.Id == "" { 251 job.Id = jm.uuidGen.NewString() 252 } 253 254 meta := &frameModel.MasterMeta{ 255 ProjectID: tenant.NewProjectInfo( 256 req.TenantId, 257 req.ProjectId, 258 ).UniqueID(), 259 ID: job.Id, 260 Config: job.Config, 261 State: frameModel.MasterStateUninit, 262 Ext: frameModel.MasterMetaExt{ 263 Selectors: selectors, 264 }, 265 } 266 switch job.Type { 267 case pb.Job_CVSDemo: 268 extConfig := &cvs.Config{} 269 if err := json.Unmarshal(job.Config, extConfig); err != nil { 270 return nil, status.Errorf(codes.InvalidArgument, "failed to decode config: %v", err) 271 } 272 meta.Type = frameModel.CvsJobMaster 273 case pb.Job_DM: 274 meta.Type = frameModel.DMJobMaster 275 case pb.Job_FakeJob: 276 meta.Type = frameModel.FakeJobMaster 277 default: 278 return nil, status.Errorf(codes.InvalidArgument, "job type %v is not supported", job.Type) 279 } 280 281 // create job master metadata before creating it. 282 if err := jm.frameMetaClient.InsertJob(ctx, meta); err != nil { 283 if pkgOrm.IsDuplicateEntryError(err) { 284 return nil, errors.ErrJobAlreadyExists.GenWithStackByArgs(job.Id) 285 } 286 return nil, err 287 } 288 289 // TODO: Refine me. split the BaseMaster 290 defaultMaster, ok := jm.BaseMaster.(interface { 291 SetProjectInfo(frameModel.MasterID, tenant.ProjectInfo) 292 }) 293 if ok { 294 defaultMaster.SetProjectInfo(meta.ID, tenant.NewProjectInfo(req.TenantId, req.ProjectId)) 295 } else { 296 log.Error("jobmanager don't have the 'SetProjectInfo' interface", 297 zap.String("masterID", meta.ID), 298 zap.Any("projectInfo", tenant.NewProjectInfo(req.TenantId, req.ProjectId))) 299 } 300 301 // CreateWorker here is to create job master actually 302 // TODO: use correct worker cost 303 workerID, err := jm.frameworkCreateWorker(meta) 304 if err != nil { 305 err2 := metadata.DeleteMasterMeta(ctx, jm.frameMetaClient, meta.ID) 306 if err2 != nil { 307 // TODO: add more GC mechanism if master meta is failed to delete 308 log.Error("failed to delete master meta", zap.Error(err2)) 309 } 310 311 log.Error("create job master met error", zap.Error(err)) 312 return nil, err 313 } 314 315 if workerID != job.Id { 316 log.Panic("job id is not equal to worker id of job master", zap.String("job-id", job.Id), zap.String("worker-id", workerID)) 317 } 318 jm.JobFsm.JobDispatched(meta, false /*addFromFailover*/) 319 320 return buildPBJob(meta, false /* includeConfig */) 321 } 322 323 func validateCreateJobRequest(req *pb.CreateJobRequest) error { 324 if req.Job == nil { 325 return status.Error(codes.InvalidArgument, "job must not be nil") 326 } 327 if req.Job.Id != "" && !jobIDRegex.MatchString(req.Job.Id) { 328 return status.Errorf(codes.InvalidArgument, "job id must match %s", jobIDRegex.String()) 329 } 330 if req.Job.Type == pb.Job_TypeUnknown { 331 return status.Error(codes.InvalidArgument, "job type must be specified") 332 } 333 return nil 334 } 335 336 func convertSelectors(req *pb.CreateJobRequest) ([]*label.Selector, error) { 337 if len(req.GetJob().Selectors) == 0 { 338 return nil, nil 339 } 340 341 ret := make([]*label.Selector, 0, len(req.GetJob().Selectors)) 342 for _, pbSel := range req.Job.Selectors { 343 sel, err := schedModel.SelectorFromPB(pbSel) 344 if err != nil { 345 return nil, err 346 } 347 if err := sel.Validate(); err != nil { 348 return nil, err 349 } 350 ret = append(ret, sel) 351 } 352 return ret, nil 353 } 354 355 // ListJobs implements JobManagerServer.ListJobs. 356 func (jm *JobManagerImpl) ListJobs(ctx context.Context, req *pb.ListJobsRequest) (*pb.ListJobsResponse, error) { 357 masterMetas, err := jm.frameMetaClient.QueryJobs(ctx) 358 if err != nil { 359 return nil, err 360 } 361 362 sort.Slice(masterMetas, func(i, j int) bool { 363 return masterMetas[i].ID < masterMetas[j].ID 364 }) 365 366 firstIdx := sort.Search(len(masterMetas), func(i int) bool { 367 return masterMetas[i].ID > req.PageToken 368 }) 369 370 pageSize := req.PageSize 371 if pageSize <= 0 { 372 pageSize = defaultListPageSize 373 } else if pageSize > maxListPageSize { 374 pageSize = maxListPageSize 375 } 376 377 resp := &pb.ListJobsResponse{} 378 for i := firstIdx; i < len(masterMetas); i++ { 379 if masterMetas[i].Type == frameModel.JobManager { 380 continue 381 } 382 383 job, err := buildPBJob(masterMetas[i], req.IncludeConfig) 384 if err != nil { 385 return nil, err 386 } 387 if req.Type != pb.Job_TypeUnknown && job.Type != req.Type { 388 continue 389 } 390 if req.State != pb.Job_StateUnknown && job.State != req.State { 391 continue 392 } 393 jm.tryQueryJobDetail(ctx, masterMetas[i].Addr, job) 394 395 resp.Jobs = append(resp.Jobs, job) 396 // Retrieve one more job to determine whether there is a next page. 397 if int32(len(resp.Jobs)) >= pageSize+1 { 398 break 399 } 400 } 401 402 if len(resp.Jobs) > int(pageSize) { 403 resp.Jobs = resp.Jobs[:pageSize] 404 resp.NextPageToken = resp.Jobs[pageSize-1].Id 405 } 406 return resp, nil 407 } 408 409 func (jm *JobManagerImpl) tryQueryJobDetail(ctx context.Context, jobMasterAddr string, job *pb.Job) { 410 // If job is not running, we can't query job detail from jobmaster. 411 if job.State != pb.Job_Running || jm.JobFsm.QueryOnlineJob(job.Id) == nil { 412 return 413 } 414 detail, httpErr := jm.jobHTTPClient.GetJobDetail(ctx, jobMasterAddr, job.Id) 415 if httpErr != nil { 416 job.Error = &pb.Job_Error{ 417 Code: httpErr.Code, 418 Message: httpErr.Message, 419 } 420 } else { 421 job.Detail = detail 422 } 423 } 424 425 func buildPBJob(masterMeta *frameModel.MasterMeta, includeConfig bool) (*pb.Job, error) { 426 var jobType pb.Job_Type 427 switch tp := framework.MustConvertWorkerType2JobType(masterMeta.Type); tp { 428 case engineModel.JobTypeCVSDemo: 429 jobType = pb.Job_CVSDemo 430 case engineModel.JobTypeDM: 431 jobType = pb.Job_DM 432 case engineModel.JobTypeCDC: 433 jobType = pb.Job_CDC 434 case engineModel.JobTypeFakeJob: 435 jobType = pb.Job_FakeJob 436 default: 437 return nil, errors.Errorf("job %s has unknown type %v", masterMeta.ID, masterMeta.Type) 438 } 439 440 var jobState pb.Job_State 441 switch masterMeta.State { 442 case frameModel.MasterStateUninit: 443 jobState = pb.Job_Created 444 case frameModel.MasterStateInit: 445 jobState = pb.Job_Running 446 case frameModel.MasterStateFinished: 447 jobState = pb.Job_Finished 448 case frameModel.MasterStateStopped: 449 jobState = pb.Job_Canceled 450 case frameModel.MasterStateFailed: 451 jobState = pb.Job_Failed 452 default: 453 return nil, errors.Errorf("job %s has unknown state %v", masterMeta.ID, masterMeta.State) 454 } 455 456 var selectors []*pb.Selector 457 for _, sel := range masterMeta.Ext.Selectors { 458 pbSel, err := schedModel.SelectorToPB(sel) 459 if err != nil { 460 return nil, errors.Annotate(err, "buildPBJob") 461 } 462 selectors = append(selectors, pbSel) 463 } 464 job := &pb.Job{ 465 Id: masterMeta.ID, 466 Type: jobType, 467 State: jobState, 468 Detail: masterMeta.Detail, 469 Error: &pb.Job_Error{ 470 Message: masterMeta.ErrorMsg, 471 }, 472 Selectors: selectors, 473 } 474 if includeConfig { 475 job.Config = masterMeta.Config 476 } 477 return job, nil 478 } 479 480 // GetJobMasterForwardAddress implements JobManager.GetJobMasterForwardAddress. 481 func (jm *JobManagerImpl) GetJobMasterForwardAddress(ctx context.Context, jobID string) (string, error) { 482 // Always query from database. Master meta in JobFsm may be out of date. 483 masterMeta, err := jm.frameMetaClient.GetJobByID(ctx, jobID) 484 if err != nil { 485 if pkgOrm.IsNotFoundError(err) { 486 return "", errors.ErrJobNotFound.GenWithStackByArgs(jobID) 487 } 488 return "", err 489 } 490 if masterMeta.State != frameModel.MasterStateInit || jm.JobFsm.QueryOnlineJob(jobID) == nil { 491 return "", errors.ErrJobNotRunning.GenWithStackByArgs(jobID) 492 } 493 return masterMeta.Addr, nil 494 } 495 496 // GetJobStatuses returns the status code of all jobs that are not deleted. 497 func (jm *JobManagerImpl) GetJobStatuses( 498 ctx context.Context, 499 ) (map[frameModel.MasterID]frameModel.MasterState, error) { 500 // BUG? NO filter in the implement 501 jobs, err := jm.frameMetaClient.QueryJobs(ctx) 502 if err != nil { 503 return nil, err 504 } 505 506 ret := make(map[frameModel.MasterID]frameModel.MasterState, len(jobs)) 507 for _, jobMeta := range jobs { 508 ret[jobMeta.ID] = jobMeta.State 509 } 510 return ret, nil 511 } 512 513 // UpdateJobStatus implements JobManager.UpdateJobStatus 514 func (jm *JobManagerImpl) UpdateJobStatus( 515 ctx context.Context, jobID frameModel.MasterID, errMsg string, code frameModel.MasterState, 516 ) error { 517 // Note since the job is not online, it is safe to get from metastore and then update 518 meta, err := jm.frameMetaClient.GetJobByID(ctx, jobID) 519 if err != nil { 520 return err 521 } 522 meta.ErrorMsg = errMsg 523 meta.State = code 524 return jm.frameMetaClient.UpsertJob(ctx, meta) 525 } 526 527 // NewJobManagerImpl creates a new JobManagerImpl instance 528 func NewJobManagerImpl( 529 dctx *dcontext.Context, 530 id frameModel.MasterID, 531 backoffConfig *jobop.BackoffConfig, 532 ) (*JobManagerImpl, error) { 533 metaCli, err := dctx.Deps().Construct(func(cli pkgOrm.Client) (pkgOrm.Client, error) { 534 return cli, nil 535 }) 536 if err != nil { 537 return nil, err 538 } 539 540 metaClient := metaCli.(pkgOrm.Client) 541 cli := metadata.NewMasterMetadataClient(id, metaClient) 542 543 httpCli, err := httputil.NewClient(nil) 544 if err != nil { 545 return nil, err 546 } 547 httpCli.SetTimeout(defaultHTTPTimeout) 548 549 clocker := clock.New() 550 impl := &JobManagerImpl{ 551 JobFsm: NewJobFsm(), 552 uuidGen: uuid.NewGenerator(), 553 masterMetaClient: cli, 554 clocker: clocker, 555 frameMetaClient: metaClient, 556 jobStatusChangeMu: ctxmu.New(), 557 notifier: notifier.NewNotifier[resManager.JobStatusChangeEvent](), 558 jobOperatorNotifier: new(notify.Notifier), 559 jobHTTPClient: engineHTTPUtil.NewJobHTTPClient(httpCli), 560 JobBackoffMgr: jobop.NewBackoffManagerImpl(clocker, backoffConfig), 561 } 562 impl.BaseMaster = framework.NewBaseMaster( 563 dctx, 564 impl, 565 id, 566 frameModel.JobManager, 567 ) 568 impl.jobOperator = jobop.NewJobOperatorImpl(metaClient, impl) 569 wg, ctx := errgroup.WithContext(dctx) 570 impl.wg = wg 571 572 // Note the meta data of job manager is not used, it is safe to overwrite it 573 // every time a new server master leader is elected. And we always mark the 574 // Initialized to true in order to trigger OnMasterRecovered of job manager. 575 meta := impl.MasterMeta() 576 meta.State = frameModel.MasterStateInit 577 err = metadata.StoreMasterMeta(ctx, impl.frameMetaClient, meta) 578 if err != nil { 579 return nil, err 580 } 581 err = impl.BaseMaster.Init(ctx) 582 if err != nil { 583 _ = impl.BaseMaster.Close(ctx) 584 return nil, err 585 } 586 impl.bgJobOperatorLoop(ctx) 587 588 return impl, err 589 } 590 591 // InitImpl implements frame.MasterImpl.InitImpl 592 func (jm *JobManagerImpl) InitImpl(ctx context.Context) error { 593 return nil 594 } 595 596 // Tick implements frame.MasterImpl.Tick 597 func (jm *JobManagerImpl) Tick(ctx context.Context) error { 598 filterQuotaError := func(err error) (exceedQuota bool, retErr error) { 599 if err == nil { 600 return false, nil 601 } 602 if errors.Is(err, errors.ErrMasterConcurrencyExceeded) { 603 log.Warn("create worker exceeds quota, retry later", zap.Error(err)) 604 return true, nil 605 } 606 return false, err 607 } 608 609 err := jm.JobFsm.IterPendingJobs( 610 func(job *frameModel.MasterMeta) (string, error) { 611 isJobCanceling := jm.jobOperator.IsJobCanceling(ctx, job.ID) 612 if isJobCanceling || jm.JobBackoffMgr.Terminate(job.ID) { 613 state := frameModel.MasterStateFailed 614 if isJobCanceling { 615 state = frameModel.MasterStateStopped 616 } 617 if err := jm.terminateJob(ctx, job.ErrorMsg, job.ID, state); err != nil { 618 return "", err 619 } 620 return "", errors.ErrMasterCreateWorkerTerminate.FastGenByArgs() 621 } 622 if !jm.JobBackoffMgr.Allow(job.ID) { 623 return "", errors.ErrMasterCreateWorkerBackoff.FastGenByArgs() 624 } 625 return jm.frameworkCreateWorker(job) 626 }) 627 if _, err = filterQuotaError(err); err != nil { 628 return err 629 } 630 631 if !jm.tombstoneCleaned && jm.BaseMaster.IsMasterReady() { 632 for _, worker := range jm.BaseMaster.GetWorkers() { 633 // clean tombstone workers from worker manager and they will be 634 // re-created in the following IterWaitAckJobs 635 tombstoneHandle := worker.GetTombstone() 636 if tombstoneHandle != nil { 637 if err := tombstoneHandle.CleanTombstone(ctx); err != nil { 638 return err 639 } 640 continue 641 } 642 // mark non-tombstone workers as online 643 err := jm.JobFsm.JobOnline(worker) 644 // ignore worker that is not in WaitAck list 645 if err != nil && !errors.Is(err, errors.ErrWorkerNotFound) { 646 return err 647 } 648 } 649 err = jm.JobFsm.IterWaitAckJobs( 650 func(job *frameModel.MasterMeta) (string, error) { 651 return jm.frameworkCreateWorker(job) 652 }) 653 exceedQuota, err := filterQuotaError(err) 654 if err != nil { 655 return err 656 } 657 // if met exceed quota error, the remaining jobs need to be failover in 658 // another tick 659 if !exceedQuota { 660 jm.tombstoneCleaned = true 661 } 662 } 663 664 return nil 665 } 666 667 // OnMasterRecovered implements frame.MasterImpl.OnMasterRecovered 668 func (jm *JobManagerImpl) OnMasterRecovered(ctx context.Context) error { 669 jobs, err := jm.masterMetaClient.LoadAllMasters(ctx) 670 if err != nil { 671 return err 672 } 673 674 // TODO: refine me, split the BaseMaster interface 675 impl, ok := jm.BaseMaster.(interface { 676 InitProjectInfosAfterRecover([]*frameModel.MasterMeta) 677 }) 678 if !ok { 679 log.Panic("unfound interface for BaseMaster", zap.String("interface", "InitProjectInfosAfterRecover")) 680 return errors.ErrMasterInterfaceNotFound.GenWithStackByArgs() 681 } 682 impl.InitProjectInfosAfterRecover(jobs) 683 684 for _, job := range jobs { 685 if job.Type == frameModel.JobManager { 686 continue 687 } 688 // TODO: filter the job in backend 689 if job.State.IsTerminatedState() { 690 log.Info("skip job in terminated status", zap.Any("job", job)) 691 continue 692 } 693 jm.JobFsm.JobDispatched(job, true /*addFromFailover*/) 694 log.Info("recover job, move it to WaitAck job queue", zap.Any("job", job)) 695 } 696 return nil 697 } 698 699 // OnWorkerDispatched implements frame.MasterImpl.OnWorkerDispatched 700 func (jm *JobManagerImpl) OnWorkerDispatched(worker framework.WorkerHandle, result error) error { 701 if result != nil { 702 if errors.Is(result, errors.ErrCreateWorkerTerminate) { 703 errMsg := result.Error() 704 if cause := errors.Cause(result); cause != nil { 705 errMsg = cause.Error() 706 } 707 if err := jm.terminateJob( 708 context.Background(), errMsg, worker.ID(), frameModel.MasterStateFailed, 709 ); err != nil { 710 return err 711 } 712 jm.JobFsm.JobOffline(worker, false /* needFailover */) 713 return nil 714 } 715 log.Warn("dispatch worker met error", zap.Error(result)) 716 jm.JobBackoffMgr.JobFail(worker.ID()) 717 return jm.JobFsm.JobDispatchFailed(worker) 718 } 719 return nil 720 } 721 722 // OnWorkerOnline implements frame.MasterImpl.OnWorkerOnline 723 func (jm *JobManagerImpl) OnWorkerOnline(worker framework.WorkerHandle) error { 724 log.Info("on worker online", zap.Any("id", worker.ID())) 725 jm.JobBackoffMgr.JobOnline(worker.ID()) 726 return jm.JobFsm.JobOnline(worker) 727 } 728 729 // OnWorkerOffline implements frame.MasterImpl.OnWorkerOffline 730 func (jm *JobManagerImpl) OnWorkerOffline(worker framework.WorkerHandle, reason error) error { 731 needFailover := true 732 if errors.Is(reason, errors.ErrWorkerFinish) { 733 log.Info("job master finished", zap.String("id", worker.ID())) 734 needFailover = false 735 } else if errors.Is(reason, errors.ErrWorkerCancel) { 736 log.Info("job master canceled", zap.String("id", worker.ID())) 737 needFailover = false 738 jm.jobOperatorNotifier.Notify() 739 } else if errors.Is(reason, errors.ErrWorkerFailed) { 740 log.Info("job master failed permanently", zap.String("id", worker.ID())) 741 needFailover = false 742 } else { 743 log.Info("on worker offline", zap.Any("id", worker.ID()), zap.Any("reason", reason)) 744 } 745 ctx, cancel := context.WithTimeout(context.Background(), time.Second*5) 746 defer cancel() 747 if err := worker.GetTombstone().CleanTombstone(ctx); err != nil { 748 return err 749 } 750 if needFailover { 751 jm.JobBackoffMgr.JobFail(worker.ID()) 752 } else { 753 jm.JobBackoffMgr.JobTerminate(worker.ID()) 754 } 755 jm.JobFsm.JobOffline(worker, needFailover) 756 return nil 757 } 758 759 // OnWorkerMessage implements frame.MasterImpl.OnWorkerMessage 760 func (jm *JobManagerImpl) OnWorkerMessage(worker framework.WorkerHandle, topic p2p.Topic, message interface{}) error { 761 log.Info("on worker message", zap.Any("id", worker.ID()), zap.Any("topic", topic), zap.Any("message", message)) 762 return nil 763 } 764 765 // OnWorkerStatusUpdated implements frame.MasterImpl.OnWorkerStatusUpdated 766 func (jm *JobManagerImpl) OnWorkerStatusUpdated(worker framework.WorkerHandle, newStatus *frameModel.WorkerStatus) error { 767 log.Info("on worker status updated", zap.String("worker-id", worker.ID()), zap.Any("status", newStatus)) 768 return nil 769 } 770 771 // CloseImpl implements frame.MasterImpl.CloseImpl 772 func (jm *JobManagerImpl) CloseImpl(ctx context.Context) { 773 jm.notifier.Close() 774 jm.jobHTTPClient.Close() 775 jm.jobOperatorNotifier.Close() 776 } 777 778 // StopImpl implements frame.MasterImpl.StopImpl 779 func (jm *JobManagerImpl) StopImpl(ctx context.Context) { 780 jm.CloseImpl(ctx) 781 } 782 783 // WatchJobStatuses returns a snapshot of job statuses followed by a stream 784 // of job status changes. 785 func (jm *JobManagerImpl) WatchJobStatuses( 786 ctx context.Context, 787 ) (resManager.JobStatusesSnapshot, *notifier.Receiver[resManager.JobStatusChangeEvent], error) { 788 // We add an explicit deadline to make sure that 789 // any potential problem will not block the JobManager forever. 790 ctx, cancel := context.WithTimeout(ctx, 10*time.Second) 791 defer cancel() 792 793 // Note that the lock is cancellable by the context. 794 if ok := jm.jobStatusChangeMu.Lock(ctx); !ok { 795 return nil, nil, errors.Trace(ctx.Err()) 796 } 797 defer jm.jobStatusChangeMu.Unlock() 798 799 snapshot, err := jm.GetJobStatuses(ctx) 800 if err != nil { 801 return nil, nil, err 802 } 803 804 // Waits for pending JobStatusChangeEvents to be flushed, 805 // so that the new receiver does not receive any stale data. 806 err = jm.notifier.Flush(ctx) 807 if err != nil { 808 return nil, nil, errors.Trace(err) 809 } 810 811 receiver := jm.notifier.NewReceiver() 812 return snapshot, receiver, nil 813 } 814 815 func (jm *JobManagerImpl) bgJobOperatorLoop(ctx context.Context) { 816 jm.wg.Go(func() error { 817 defer func() { 818 log.Info("job manager job operator loop exited") 819 }() 820 receiver, err := jm.jobOperatorNotifier.NewReceiver(jobOperateInterval) 821 if err != nil { 822 return err 823 } 824 defer receiver.Stop() 825 for { 826 select { 827 case <-ctx.Done(): 828 return errors.Trace(ctx.Err()) 829 case _, ok := <-receiver.C: 830 if !ok { 831 return nil 832 } 833 } 834 if err := jm.jobOperator.Tick(ctx); err != nil { 835 // error returns from Tick is only caused by metastore error, so 836 // only log it and retry later. 837 log.Warn("job operator tick with error", zap.Error(err)) 838 } 839 } 840 }) 841 } 842 843 func (jm *JobManagerImpl) frameworkCreateWorker(job *frameModel.MasterMeta) (string, error) { 844 return jm.BaseMaster.CreateWorker(job.Type, job, 845 framework.CreateWorkerWithSelectors(job.Ext.Selectors...)) 846 } 847 848 func (jm *JobManagerImpl) terminateJob( 849 ctx context.Context, errMsg string, jobID string, state frameModel.MasterState, 850 ) error { 851 log.Info("job master terminated", zap.String("job-id", jobID), 852 zap.String("error", errMsg), zap.Any("state", state)) 853 ctx, cancel := context.WithTimeout(ctx, time.Second*5) 854 defer cancel() 855 return jm.UpdateJobStatus(ctx, jobID, errMsg, state) 856 }