github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/engine/servermaster/job_fsm.go (about)

     1  // Copyright 2022 PingCAP, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // See the License for the specific language governing permissions and
    12  // limitations under the License.
    13  
    14  package servermaster
    15  
    16  import (
    17  	"sync"
    18  
    19  	"github.com/pingcap/log"
    20  	pb "github.com/pingcap/tiflow/engine/enginepb"
    21  	"github.com/pingcap/tiflow/engine/framework"
    22  	frameModel "github.com/pingcap/tiflow/engine/framework/model"
    23  	"github.com/pingcap/tiflow/pkg/errors"
    24  	"go.uber.org/zap"
    25  )
    26  
// JobHolder holds job meta and worker handle for a job.
type JobHolder struct {
	// workerHandle is filled in by JobFsm.JobOnline; holders created by
	// JobDispatched, IterPendingJobs or QueryJob (pending case) leave it unset.
	workerHandle framework.WorkerHandle
	// masterMeta is the meta of the job master this holder tracks.
	masterMeta *frameModel.MasterMeta
	// True means the job is loaded from metastore during jobmanager failover.
	// Otherwise it is added by SubmitJob.
	// IterWaitAckJobs only re-dispatches holders with this flag set and
	// clears it after a successful dispatch.
	addFromFailover bool
}
    35  
// MasterMeta returns master meta of the job.
// The stored pointer is returned as is, without copying.
func (jh *JobHolder) MasterMeta() *frameModel.MasterMeta {
	return jh.masterMeta
}
    40  
// WorkerHandle returns the job master's worker handle.
// The handle is only populated for holders created by JobFsm.JobOnline; for
// other holders the zero value is returned.
func (jh *JobHolder) WorkerHandle() framework.WorkerHandle {
	return jh.workerHandle
}
    45  
// JobFsm manages state of all job masters, job master state forms a finite-state
// machine. Note job master managed in JobFsm is in running status, which means
// the job is not terminated or finished.
//
// Transitions as implemented by the methods in this file:
//   - JobDispatched:     (new)   -> WaitAck
//   - JobOnline:         WaitAck -> Online
//   - JobOffline:        Online/WaitAck -> Pending when needFailover is true,
//     otherwise the job is dropped from the FSM (Finished)
//   - JobDispatchFailed: WaitAck -> Pending
//   - IterPendingJobs:   Pending -> WaitAck
//
//	,-------.                   ,-------.            ,-------.       ,--------.
//	|WaitAck|                   |Online |            |Pending|       |Finished|
//	`---+---'                   `---+---'            `---+---'       `---+----'
//	    |                           |                    |               |
//	    | Master                    |                    |               |
//	    |  .OnWorkerOnline          |                    |               |
//	    |-------------------------->|                    |               |
//	    |                           |                    |               |
//	    |                           | Master             |               |
//	    |                           |   .OnWorkerOffline |               |
//	    |                           |   (failover)       |               |
//	    |                           |------------------->|               |
//	    |                           |                    |               |
//	    |                           | Master             |               |
//	    |                           |   .OnWorkerOffline |               |
//	    |                           |   (finish)         |               |
//	    |                           |----------------------------------->|
//	    |                           |                    |               |
//	    | Master                    |                    |               |
//	    |  .OnWorkerOffline         |                    |               |
//	    |  (failover)               |                    |               |
//	    |----------------------------------------------->|               |
//	    |                           |                    |               |
//	    | Master                    |                    |               |
//	    |  .OnWorkerOffline         |                    |               |
//	    |  (finish)                 |                    |               |
//	    |--------------------------------------------------------------->|
//	    |                           |                    |               |
//	    |                           | Master             |               |
//	    |                           |   .CreateWorker    |               |
//	    |<-----------------------------------------------|               |
//	    |                           |                    |               |
//	    | Master                    |                    |               |
//	    |  .OnWorkerDispatched      |                    |               |
//	    |  (with error)             |                    |               |
//	    |----------------------------------------------->|               |
//	    |                           |                    |               |
//	    |                           |                    |               |
//	    |                           |                    |               |
type JobFsm struct {
	// JobStats is embedded but left unset by NewJobFsm; the JobCount method
	// declared directly on *JobFsm takes precedence over the promoted one.
	JobStats

	// jobsMu guards the three job maps below.
	jobsMu sync.RWMutex
	// pendingJobs holds jobs waiting to be dispatched again (see IterPendingJobs).
	pendingJobs map[frameModel.MasterID]*frameModel.MasterMeta
	// waitAckJobs holds dispatched jobs that have not been moved online yet.
	waitAckJobs map[frameModel.MasterID]*JobHolder
	// onlineJobs holds jobs registered through JobOnline.
	onlineJobs map[frameModel.MasterID]*JobHolder
}
    98  
// JobStats defines a statistics interface for JobFsm
type JobStats interface {
	// JobCount returns the number of jobs in the given state.
	JobCount(status pb.Job_State) int
}
   103  
   104  // NewJobFsm creates a new job fsm
   105  func NewJobFsm() *JobFsm {
   106  	return &JobFsm{
   107  		pendingJobs: make(map[frameModel.MasterID]*frameModel.MasterMeta),
   108  		waitAckJobs: make(map[frameModel.MasterID]*JobHolder),
   109  		onlineJobs:  make(map[frameModel.MasterID]*JobHolder),
   110  	}
   111  }
   112  
   113  // QueryOnlineJob queries job from online job list
   114  func (fsm *JobFsm) QueryOnlineJob(jobID frameModel.MasterID) *JobHolder {
   115  	fsm.jobsMu.RLock()
   116  	defer fsm.jobsMu.RUnlock()
   117  	return fsm.onlineJobs[jobID]
   118  }
   119  
   120  // QueryJob queries job with given jobID and returns QueryJobResponse
   121  func (fsm *JobFsm) QueryJob(jobID frameModel.MasterID) *JobHolder {
   122  	fsm.jobsMu.Lock()
   123  	defer fsm.jobsMu.Unlock()
   124  
   125  	if meta, ok := fsm.pendingJobs[jobID]; ok {
   126  		return &JobHolder{
   127  			masterMeta: meta,
   128  		}
   129  	}
   130  
   131  	if job, ok := fsm.waitAckJobs[jobID]; ok {
   132  		return job
   133  	}
   134  
   135  	if job, ok := fsm.onlineJobs[jobID]; ok {
   136  		return job
   137  	}
   138  
   139  	return nil
   140  }
   141  
   142  // JobDispatched is called when a job is firstly created or server master is failovered
   143  func (fsm *JobFsm) JobDispatched(job *frameModel.MasterMeta, addFromFailover bool) {
   144  	fsm.jobsMu.Lock()
   145  	defer fsm.jobsMu.Unlock()
   146  	fsm.waitAckJobs[job.ID] = &JobHolder{
   147  		masterMeta:      job,
   148  		addFromFailover: addFromFailover,
   149  	}
   150  }
   151  
   152  // IterPendingJobs iterates all pending jobs and dispatch(via create worker) them again.
   153  func (fsm *JobFsm) IterPendingJobs(dispatchJobFn func(job *frameModel.MasterMeta) (string, error)) error {
   154  	fsm.jobsMu.Lock()
   155  	defer fsm.jobsMu.Unlock()
   156  
   157  	for oldJobID, job := range fsm.pendingJobs {
   158  		id, err := dispatchJobFn(job)
   159  		if err != nil {
   160  			// This job is being backoff, skip it and process other jobs.
   161  			if errors.Is(err, errors.ErrMasterCreateWorkerBackoff) {
   162  				continue
   163  			}
   164  			if errors.Is(err, errors.ErrMasterCreateWorkerTerminate) {
   165  				delete(fsm.pendingJobs, oldJobID)
   166  				continue
   167  			}
   168  			return err
   169  		}
   170  		delete(fsm.pendingJobs, oldJobID)
   171  		job.ID = id
   172  		fsm.waitAckJobs[id] = &JobHolder{
   173  			masterMeta: job,
   174  		}
   175  		log.Info("job master recovered", zap.Any("job", job))
   176  	}
   177  
   178  	return nil
   179  }
   180  
   181  // IterWaitAckJobs iterates wait ack jobs, failover them if they are added from failover
   182  func (fsm *JobFsm) IterWaitAckJobs(dispatchJobFn func(job *frameModel.MasterMeta) (string, error)) error {
   183  	fsm.jobsMu.Lock()
   184  	defer fsm.jobsMu.Unlock()
   185  
   186  	for id, job := range fsm.waitAckJobs {
   187  		if !job.addFromFailover {
   188  			continue
   189  		}
   190  		_, err := dispatchJobFn(job.masterMeta)
   191  		if err != nil {
   192  			return err
   193  		}
   194  		fsm.waitAckJobs[id].addFromFailover = false
   195  		log.Info("tombstone job master doesn't receive heartbeat in time, recreate it", zap.Any("job", job))
   196  	}
   197  
   198  	return nil
   199  }
   200  
   201  // JobOnline is called when the first heartbeat of job is received
   202  func (fsm *JobFsm) JobOnline(worker framework.WorkerHandle) error {
   203  	fsm.jobsMu.Lock()
   204  	defer fsm.jobsMu.Unlock()
   205  
   206  	job, ok := fsm.waitAckJobs[worker.ID()]
   207  	if !ok {
   208  		return errors.ErrWorkerNotFound.GenWithStackByArgs(worker.ID())
   209  	}
   210  	fsm.onlineJobs[worker.ID()] = &JobHolder{
   211  		workerHandle: worker,
   212  		masterMeta:   job.masterMeta,
   213  	}
   214  	delete(fsm.waitAckJobs, worker.ID())
   215  	return nil
   216  }
   217  
   218  // JobOffline is called when a job meets error or finishes
   219  func (fsm *JobFsm) JobOffline(worker framework.WorkerHandle, needFailover bool) {
   220  	fsm.jobsMu.Lock()
   221  	defer fsm.jobsMu.Unlock()
   222  
   223  	job, ok := fsm.onlineJobs[worker.ID()]
   224  	if ok {
   225  		delete(fsm.onlineJobs, worker.ID())
   226  	} else {
   227  		job, ok = fsm.waitAckJobs[worker.ID()]
   228  		if !ok {
   229  			log.Warn("unknown worker, ignore it", zap.String("id", worker.ID()))
   230  			return
   231  		}
   232  		delete(fsm.waitAckJobs, worker.ID())
   233  	}
   234  	if needFailover {
   235  		fsm.pendingJobs[worker.ID()] = job.masterMeta
   236  	}
   237  }
   238  
   239  // JobDispatchFailed is called when a job dispatch fails
   240  func (fsm *JobFsm) JobDispatchFailed(worker framework.WorkerHandle) error {
   241  	fsm.jobsMu.Lock()
   242  	defer fsm.jobsMu.Unlock()
   243  
   244  	job, ok := fsm.waitAckJobs[worker.ID()]
   245  	if !ok {
   246  		return errors.ErrWorkerNotFound.GenWithStackByArgs(worker.ID())
   247  	}
   248  	fsm.pendingJobs[worker.ID()] = job.masterMeta
   249  	delete(fsm.waitAckJobs, worker.ID())
   250  	return nil
   251  }
   252  
   253  // JobCount queries job count based on job status
   254  func (fsm *JobFsm) JobCount(status pb.Job_State) int {
   255  	fsm.jobsMu.RLock()
   256  	defer fsm.jobsMu.RUnlock()
   257  	switch status {
   258  	case pb.Job_Created:
   259  		return len(fsm.pendingJobs) + len(fsm.waitAckJobs)
   260  	case pb.Job_Running:
   261  		return len(fsm.onlineJobs)
   262  	default:
   263  		// TODO: support other job status count
   264  		return 0
   265  	}
   266  }