github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/engine/servermaster/job_fsm.go (about) 1 // Copyright 2022 PingCAP, Inc. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // See the License for the specific language governing permissions and 12 // limitations under the License. 13 14 package servermaster 15 16 import ( 17 "sync" 18 19 "github.com/pingcap/log" 20 pb "github.com/pingcap/tiflow/engine/enginepb" 21 "github.com/pingcap/tiflow/engine/framework" 22 frameModel "github.com/pingcap/tiflow/engine/framework/model" 23 "github.com/pingcap/tiflow/pkg/errors" 24 "go.uber.org/zap" 25 ) 26 27 // JobHolder holds job meta and worker handle for a job. 28 type JobHolder struct { 29 workerHandle framework.WorkerHandle 30 masterMeta *frameModel.MasterMeta 31 // True means the job is loaded from metastore during jobmanager failover. 32 // Otherwise it is added by SubmitJob. 33 addFromFailover bool 34 } 35 36 // MasterMeta returns master meta of the job. 37 func (jh *JobHolder) MasterMeta() *frameModel.MasterMeta { 38 return jh.masterMeta 39 } 40 41 // WorkerHandle returns the job master's worker handle. 42 func (jh *JobHolder) WorkerHandle() framework.WorkerHandle { 43 return jh.workerHandle 44 } 45 46 // JobFsm manages state of all job masters, job master state forms a finite-state 47 // machine. Note job master managed in JobFsm is in running status, which means 48 // the job is not terminated or finished. 49 // 50 // ,-------. ,-------. ,-------. ,--------. 51 // |WaitAck| |Online | |Pending| |Finished| 52 // `---+---' `---+---' `---+---' `---+----' 53 // 54 // | | | | 55 // | Master | | | 56 // | .OnWorkerOnline | | | 57 // |-------------------------->| | | 58 // | | | | 59 // | | Master | | 60 // | | .OnWorkerOffline | | 61 // | | (failover) | | 62 // | |------------------->| | 63 // | | | | 64 // | | Master | | 65 // | | .OnWorkerOffline | | 66 // | | (finish) | | 67 // | |----------------------------------->| 68 // | | | | 69 // | Master | | | 70 // | .OnWorkerOffline | | | 71 // | (failover) | | | 72 // |----------------------------------------------->| | 73 // | | | | 74 // | Master | | | 75 // | .OnWorkerOffline | | | 76 // | (finish) | | | 77 // |--------------------------------------------------------------->| 78 // | | | | 79 // | | Master | | 80 // | | .CreateWorker | | 81 // |<-----------------------------------------------| | 82 // | | | | 83 // | Master | | | 84 // | .OnWorkerDispatched | | | 85 // | (with error) | | | 86 // |----------------------------------------------->| | 87 // | | | | 88 // | | | | 89 // | | | | 90 type JobFsm struct { 91 JobStats 92 93 jobsMu sync.RWMutex 94 pendingJobs map[frameModel.MasterID]*frameModel.MasterMeta 95 waitAckJobs map[frameModel.MasterID]*JobHolder 96 onlineJobs map[frameModel.MasterID]*JobHolder 97 } 98 99 // JobStats defines a statistics interface for JobFsm 100 type JobStats interface { 101 JobCount(status pb.Job_State) int 102 } 103 104 // NewJobFsm creates a new job fsm 105 func NewJobFsm() *JobFsm { 106 return &JobFsm{ 107 pendingJobs: make(map[frameModel.MasterID]*frameModel.MasterMeta), 108 waitAckJobs: make(map[frameModel.MasterID]*JobHolder), 109 onlineJobs: make(map[frameModel.MasterID]*JobHolder), 110 } 111 } 112 113 // QueryOnlineJob queries job from online job list 114 func (fsm *JobFsm) QueryOnlineJob(jobID frameModel.MasterID) *JobHolder { 115 fsm.jobsMu.RLock() 116 defer fsm.jobsMu.RUnlock() 117 return fsm.onlineJobs[jobID] 118 } 119 120 // QueryJob queries job with given jobID and returns QueryJobResponse 121 func (fsm *JobFsm) QueryJob(jobID frameModel.MasterID) *JobHolder { 122 fsm.jobsMu.Lock() 123 defer fsm.jobsMu.Unlock() 124 125 if meta, ok := fsm.pendingJobs[jobID]; ok { 126 return &JobHolder{ 127 masterMeta: meta, 128 } 129 } 130 131 if job, ok := fsm.waitAckJobs[jobID]; ok { 132 return job 133 } 134 135 if job, ok := fsm.onlineJobs[jobID]; ok { 136 return job 137 } 138 139 return nil 140 } 141 142 // JobDispatched is called when a job is firstly created or server master is failovered 143 func (fsm *JobFsm) JobDispatched(job *frameModel.MasterMeta, addFromFailover bool) { 144 fsm.jobsMu.Lock() 145 defer fsm.jobsMu.Unlock() 146 fsm.waitAckJobs[job.ID] = &JobHolder{ 147 masterMeta: job, 148 addFromFailover: addFromFailover, 149 } 150 } 151 152 // IterPendingJobs iterates all pending jobs and dispatch(via create worker) them again. 153 func (fsm *JobFsm) IterPendingJobs(dispatchJobFn func(job *frameModel.MasterMeta) (string, error)) error { 154 fsm.jobsMu.Lock() 155 defer fsm.jobsMu.Unlock() 156 157 for oldJobID, job := range fsm.pendingJobs { 158 id, err := dispatchJobFn(job) 159 if err != nil { 160 // This job is being backoff, skip it and process other jobs. 161 if errors.Is(err, errors.ErrMasterCreateWorkerBackoff) { 162 continue 163 } 164 if errors.Is(err, errors.ErrMasterCreateWorkerTerminate) { 165 delete(fsm.pendingJobs, oldJobID) 166 continue 167 } 168 return err 169 } 170 delete(fsm.pendingJobs, oldJobID) 171 job.ID = id 172 fsm.waitAckJobs[id] = &JobHolder{ 173 masterMeta: job, 174 } 175 log.Info("job master recovered", zap.Any("job", job)) 176 } 177 178 return nil 179 } 180 181 // IterWaitAckJobs iterates wait ack jobs, failover them if they are added from failover 182 func (fsm *JobFsm) IterWaitAckJobs(dispatchJobFn func(job *frameModel.MasterMeta) (string, error)) error { 183 fsm.jobsMu.Lock() 184 defer fsm.jobsMu.Unlock() 185 186 for id, job := range fsm.waitAckJobs { 187 if !job.addFromFailover { 188 continue 189 } 190 _, err := dispatchJobFn(job.masterMeta) 191 if err != nil { 192 return err 193 } 194 fsm.waitAckJobs[id].addFromFailover = false 195 log.Info("tombstone job master doesn't receive heartbeat in time, recreate it", zap.Any("job", job)) 196 } 197 198 return nil 199 } 200 201 // JobOnline is called when the first heartbeat of job is received 202 func (fsm *JobFsm) JobOnline(worker framework.WorkerHandle) error { 203 fsm.jobsMu.Lock() 204 defer fsm.jobsMu.Unlock() 205 206 job, ok := fsm.waitAckJobs[worker.ID()] 207 if !ok { 208 return errors.ErrWorkerNotFound.GenWithStackByArgs(worker.ID()) 209 } 210 fsm.onlineJobs[worker.ID()] = &JobHolder{ 211 workerHandle: worker, 212 masterMeta: job.masterMeta, 213 } 214 delete(fsm.waitAckJobs, worker.ID()) 215 return nil 216 } 217 218 // JobOffline is called when a job meets error or finishes 219 func (fsm *JobFsm) JobOffline(worker framework.WorkerHandle, needFailover bool) { 220 fsm.jobsMu.Lock() 221 defer fsm.jobsMu.Unlock() 222 223 job, ok := fsm.onlineJobs[worker.ID()] 224 if ok { 225 delete(fsm.onlineJobs, worker.ID()) 226 } else { 227 job, ok = fsm.waitAckJobs[worker.ID()] 228 if !ok { 229 log.Warn("unknown worker, ignore it", zap.String("id", worker.ID())) 230 return 231 } 232 delete(fsm.waitAckJobs, worker.ID()) 233 } 234 if needFailover { 235 fsm.pendingJobs[worker.ID()] = job.masterMeta 236 } 237 } 238 239 // JobDispatchFailed is called when a job dispatch fails 240 func (fsm *JobFsm) JobDispatchFailed(worker framework.WorkerHandle) error { 241 fsm.jobsMu.Lock() 242 defer fsm.jobsMu.Unlock() 243 244 job, ok := fsm.waitAckJobs[worker.ID()] 245 if !ok { 246 return errors.ErrWorkerNotFound.GenWithStackByArgs(worker.ID()) 247 } 248 fsm.pendingJobs[worker.ID()] = job.masterMeta 249 delete(fsm.waitAckJobs, worker.ID()) 250 return nil 251 } 252 253 // JobCount queries job count based on job status 254 func (fsm *JobFsm) JobCount(status pb.Job_State) int { 255 fsm.jobsMu.RLock() 256 defer fsm.jobsMu.RUnlock() 257 switch status { 258 case pb.Job_Created: 259 return len(fsm.pendingJobs) + len(fsm.waitAckJobs) 260 case pb.Job_Running: 261 return len(fsm.onlineJobs) 262 default: 263 // TODO: support other job status count 264 return 0 265 } 266 }