github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/engine/framework/internal/master/worker_manager.go

// Copyright 2022 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// See the License for the specific language governing permissions and
// limitations under the License.

package master

import (
	"context"
	"sync"
	"time"

	"github.com/pingcap/tiflow/engine/framework/config"
	"github.com/pingcap/tiflow/engine/framework/metadata"
	frameModel "github.com/pingcap/tiflow/engine/framework/model"
	"github.com/pingcap/tiflow/engine/framework/statusutil"
	"github.com/pingcap/tiflow/engine/model"
	"github.com/pingcap/tiflow/engine/pkg/clock"
	"github.com/pingcap/tiflow/engine/pkg/errctx"
	pkgOrm "github.com/pingcap/tiflow/engine/pkg/orm"
	"github.com/pingcap/tiflow/engine/pkg/p2p"
	"github.com/pingcap/tiflow/pkg/errors"
	"go.uber.org/zap"
)

type (
	// Callback is an alias for a worker callback function that carries no error.
	Callback = func(ctx context.Context, handle WorkerHandle) error
	// CallbackWithError is an alias for a worker callback function that may carry an error.
	CallbackWithError = func(ctx context.Context, handle WorkerHandle, err error) error
)

// WorkerManager manages all workers belonging to a job master.
type WorkerManager struct {
	mu            sync.Mutex
	workerEntries map[frameModel.WorkerID]*workerEntry
	state         workerManagerState

	workerMetaClient *metadata.WorkerStatusClient
	messageSender    p2p.MessageSender

	masterID frameModel.MasterID
	epoch    frameModel.Epoch

	onWorkerOnlined       Callback
	onWorkerOfflined      CallbackWithError
	onWorkerStatusUpdated Callback
	onWorkerDispatched    CallbackWithError

	eventQueue chan *masterEvent
	closeCh    chan struct{}
	errCenter  *errctx.ErrCenter
	// allWorkersReady is **closed** when a heartbeat has been received
	// from all workers recorded in meta.
	allWorkersReady chan struct{}
	logger          *zap.Logger

	clock clock.Clock

	timeouts config.TimeoutConfig

	wg sync.WaitGroup
}

type workerManagerState int32

const (
	workerManagerReady = workerManagerState(iota + 1)
	workerManagerLoadingMeta
	workerManagerWaitingHeartbeat
)

// NewWorkerManager creates a new WorkerManager instance.
func NewWorkerManager(
	masterID frameModel.MasterID,
	epoch frameModel.Epoch,
	meta pkgOrm.Client,
	messageSender p2p.MessageSender,
	onWorkerOnline Callback,
	onWorkerOffline CallbackWithError,
	onWorkerStatusUpdated Callback,
	onWorkerDispatched CallbackWithError,
	isInit bool,
	timeoutConfig config.TimeoutConfig,
	clock clock.Clock,
) *WorkerManager {
	state := workerManagerReady
	if !isInit {
		state = workerManagerLoadingMeta
	}

	ret := &WorkerManager{
		workerEntries: make(map[frameModel.WorkerID]*workerEntry),
		state:         state,

		workerMetaClient: metadata.NewWorkerStatusClient(masterID, meta),
		messageSender:    messageSender,

		masterID: masterID,
		epoch:    epoch,

		onWorkerOnlined:       onWorkerOnline,
		onWorkerOfflined:      onWorkerOffline,
		onWorkerStatusUpdated: onWorkerStatusUpdated,
		onWorkerDispatched:    onWorkerDispatched,

		eventQueue:      make(chan *masterEvent, 1024),
		closeCh:         make(chan struct{}),
		errCenter:       errctx.NewErrCenter(),
		allWorkersReady: make(chan struct{}),

		clock:    clock,
		timeouts: timeoutConfig,
	}

	ret.wg.Add(1)
	go func() {
		defer ret.wg.Done()
		if err := ret.runBackgroundChecker(); err != nil {
			ret.errCenter.OnError(err)
		}
	}()

	return ret
}

// Close closes the WorkerManager and waits until all resources are released.
func (m *WorkerManager) Close() {
	close(m.closeCh)
	m.wg.Wait()
}

// InitAfterRecover should be called after the master has failed over.
// This method blocks until either all workers recorded in meta have sent
// heartbeats or the heartbeat timeout period has elapsed.
func (m *WorkerManager) InitAfterRecover(ctx context.Context) (retErr error) {
	defer func() {
		if retErr != nil {
			m.errCenter.OnError(retErr)
		}
	}()

	ctx, cancel := m.errCenter.WithCancelOnFirstError(ctx)
	defer cancel()

	m.mu.Lock()
	if m.state != workerManagerLoadingMeta {
		// InitAfterRecover should only be called if
		// NewWorkerManager has been called with isInit as false.
		m.logger.Panic("Unreachable", zap.String("master-id", m.masterID))
	}

	// Unlock here because loading meta involves I/O, which can be long.
	m.mu.Unlock()

	allPersistedWorkers, err := m.workerMetaClient.LoadAllWorkers(ctx)
	if err != nil {
		return err
	}

	m.mu.Lock()
	for workerID, status := range allPersistedWorkers {
		entry := newWaitingWorkerEntry(workerID, status)
		// TODO: refine mapping from worker status to worker entry state
		if status.State == frameModel.WorkerStateFinished {
			continue
		}
		m.workerEntries[workerID] = entry
	}

	if len(m.workerEntries) == 0 {
		// Fast path when there is no active worker.
		m.state = workerManagerReady
		m.mu.Unlock()
		return nil
	}

	m.state = workerManagerWaitingHeartbeat
	m.mu.Unlock()

	timeoutInterval := m.timeouts.WorkerTimeoutDuration + m.timeouts.WorkerTimeoutGracefulDuration

	timer := m.clock.Timer(timeoutInterval)
	defer timer.Stop()

	startTime := m.clock.Now()
	select {
	case <-ctx.Done():
		return errors.Trace(ctx.Err())
	case <-m.allWorkersReady:
		m.logger.Info("All workers have sent heartbeats after master failover. Resuming right now.",
			zap.Duration("duration", m.clock.Since(startTime)))
	case <-timer.C:
		// Wait for the worker timeout to expire.
	}

	m.mu.Lock()
	for _, entry := range m.workerEntries {
		if entry.State() == workerEntryWait || entry.IsFinished() {
			entry.MarkAsTombstone()
		}
	}
	m.state = workerManagerReady
	m.mu.Unlock()

	return nil
}
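
// exampleRunWorkerManager is a minimal, hypothetical sketch of how a recovering
// job master might drive a WorkerManager: construct it with isInit == false,
// wait for surviving workers via InitAfterRecover, then call Tick periodically
// until the context is canceled. The no-op callbacks, the zap.NewNop() logger,
// and the one-second Tick cadence are illustrative assumptions only; the real
// BaseMaster wires in its own callbacks, logger, and scheduling.
func exampleRunWorkerManager(
	ctx context.Context,
	masterID frameModel.MasterID,
	epoch frameModel.Epoch,
	meta pkgOrm.Client,
	messageSender p2p.MessageSender,
	timeouts config.TimeoutConfig,
	clk clock.Clock,
) error {
	noop := func(ctx context.Context, handle WorkerHandle) error { return nil }
	noopWithErr := func(ctx context.Context, handle WorkerHandle, err error) error { return nil }

	manager := NewWorkerManager(
		masterID, epoch, meta, messageSender,
		noop, noopWithErr, noop, noopWithErr,
		false /* isInit: recovering from a failover */, timeouts, clk,
	).WithLogger(zap.NewNop())
	defer manager.Close()

	// Block until all recorded workers have reported in or timed out.
	if err := manager.InitAfterRecover(ctx); err != nil {
		return err
	}

	ticker := time.NewTicker(time.Second)
	defer ticker.Stop()
	for {
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-ticker.C:
			// Tick runs queued callbacks (online/offline/status-updated/
			// dispatch-failed) on this goroutine.
			if err := manager.Tick(ctx); err != nil {
				return err
			}
		}
	}
}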

// HandleHeartbeat handles heartbeat ping message from a worker
func (m *WorkerManager) HandleHeartbeat(msg *frameModel.HeartbeatPingMessage, fromNode p2p.NodeID) {
	m.mu.Lock()
	defer m.mu.Unlock()

	if m.state == workerManagerLoadingMeta {
		return
	}

	if !m.checkMasterEpochMatch(msg.Epoch) {
		return
	}

	entry, exists := m.workerEntries[msg.FromWorkerID]
	if !exists {
		m.logger.Info("Message from stale worker dropped",
			zap.String("master-id", m.masterID),
			zap.Any("message", msg),
			zap.String("from-node", fromNode))
		return
	}

	epoch := entry.Status().Epoch
	if !m.checkWorkerEpochMatch(epoch, msg.WorkerEpoch) {
		return
	}

	if msg.IsFinished {
		entry.SetFinished()
	}

	entry.SetExpireTime(m.nextExpireTime())

	if m.state == workerManagerWaitingHeartbeat {
		if entry.State() != workerEntryWait {
			// We should allow multiple heartbeats during the
			// workerManagerWaitingHeartbeat stage.
			return
		}

		m.logger.Info("Worker discovered", zap.String("master-id", m.masterID),
			zap.Any("worker-entry", entry))
		entry.MarkAsOnline(model.ExecutorID(fromNode), m.nextExpireTime())

		allReady := true
		for _, e := range m.workerEntries {
			if e.State() == workerEntryWait {
				allReady = false
				break
			}
		}
		if allReady {
			close(m.allWorkersReady)
			m.logger.Info("All workers have sent heartbeats, sending signal to resume the master",
				zap.String("master-id", m.masterID))
		}
	} else {
		if entry.State() != workerEntryCreated {
			// Return if it is not the first heartbeat.
			return
		}

		entry.MarkAsOnline(model.ExecutorID(fromNode), m.nextExpireTime())

		err := m.enqueueEvent(&masterEvent{
			Tp:       workerOnlineEvent,
			WorkerID: msg.FromWorkerID,
			Handle: &runningHandleImpl{
				workerID:   msg.FromWorkerID,
				executorID: model.ExecutorID(fromNode),
				manager:    m,
			},
		})
		if err != nil {
			m.errCenter.OnError(err)
		}
	}
}

// Tick should be called by the BaseMaster so that the callbacks can be
// run in the main goroutine.
func (m *WorkerManager) Tick(ctx context.Context) error {
	if err := m.errCenter.CheckError(); err != nil {
		return err
	}

	ctx, cancel := context.WithTimeout(ctx, 5*time.Second)
	defer cancel()
	ctx, cancel = m.errCenter.WithCancelOnFirstError(ctx)
	defer cancel()

	for {
		var event *masterEvent
		select {
		case <-ctx.Done():
			return errors.Trace(ctx.Err())
		case event = <-m.eventQueue:
		default:
			return nil
		}

		if event.beforeHook != nil {
			if ok := event.beforeHook(); !ok {
				// Continue to the next event.
				continue
			}
		}

		switch event.Tp {
		case workerOnlineEvent:
			if err := m.onWorkerOnlined(ctx, event.Handle); err != nil {
				return err
			}
		case workerOfflineEvent:
			if err := m.onWorkerOfflined(ctx, event.Handle, event.Err); err != nil {
				return err
			}
		case workerStatusUpdatedEvent:
			if err := m.onWorkerStatusUpdated(ctx, event.Handle); err != nil {
				return err
			}
		case workerDispatchFailedEvent:
			if err := m.onWorkerDispatched(ctx, event.Handle, event.Err); err != nil {
				return err
			}
		}
	}
}

// BeforeStartingWorker is called by the BaseMaster BEFORE the executor runs the worker,
// but after the executor records the time at which the worker is submitted.
func (m *WorkerManager) BeforeStartingWorker(
	workerID frameModel.WorkerID, executorID model.ExecutorID, epoch frameModel.Epoch,
) {
	m.mu.Lock()
	defer m.mu.Unlock()

	if _, exists := m.workerEntries[workerID]; exists {
		m.logger.Panic("worker already exists", zap.String("worker-id", workerID))
	}

	m.workerEntries[workerID] = newWorkerEntry(
		workerID,
		executorID,
		m.nextExpireTime(),
		workerEntryCreated,
		&frameModel.WorkerStatus{
			State: frameModel.WorkerStateCreated,
			Epoch: epoch,
		},
	)
}

// AbortCreatingWorker is called by BaseMaster if starting the worker has failed for sure.
// NOTE: If the RPC used to start the worker returns errors such as Canceled or DeadlineExceeded,
// it has NOT failed FOR SURE.
func (m *WorkerManager) AbortCreatingWorker(workerID frameModel.WorkerID, errIn error) {
	m.mu.Lock()
	defer m.mu.Unlock()

	event := &masterEvent{
		Tp:       workerDispatchFailedEvent,
		WorkerID: workerID,
		Handle: &tombstoneHandleImpl{
			workerID: workerID,
			manager:  m,
		},
		Err: errIn,
		beforeHook: func() bool {
			m.mu.Lock()
			defer m.mu.Unlock()

			delete(m.workerEntries, workerID)
			return true
		},
	}

	err := m.enqueueEvent(event)
	if err != nil {
		m.errCenter.OnError(err)
	}
}
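
// exampleDispatchWorker is a minimal, hypothetical sketch of the dispatch
// protocol implied by BeforeStartingWorker and AbortCreatingWorker: register
// the worker entry before issuing the start-worker RPC, and abort the entry
// only when the RPC is known to have failed for sure. The sendStartWorker
// callback and its failedForSure result are illustrative assumptions, not part
// of this package; in the real framework the RPC is issued by the BaseMaster.
func exampleDispatchWorker(
	m *WorkerManager,
	workerID frameModel.WorkerID,
	executorID model.ExecutorID,
	workerEpoch frameModel.Epoch,
	sendStartWorker func() (failedForSure bool, err error),
) error {
	// Record the entry first so that heartbeats arriving right after the RPC
	// are not dropped as coming from a stale worker.
	m.BeforeStartingWorker(workerID, executorID, workerEpoch)

	failedForSure, err := sendStartWorker()
	if err != nil && failedForSure {
		// Errors such as Canceled or DeadlineExceeded do NOT count as failing
		// for sure: the worker may still come up later, and if it never does,
		// the background timeout checker marks it offline instead.
		m.AbortCreatingWorker(workerID, err)
	}
	return err
}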

// OnWorkerStatusUpdateMessage should be called in the message handler for WorkerStatusMessage.
func (m *WorkerManager) OnWorkerStatusUpdateMessage(msg *statusutil.WorkerStatusMessage) {
	m.mu.Lock()
	defer m.mu.Unlock()

	if !m.checkMasterEpochMatch(msg.MasterEpoch) {
		return
	}

	entry, exists := m.workerEntries[msg.Worker]
	if !exists {
		m.logger.Info("WorkerStatusMessage dropped for unknown worker",
			zap.String("master-id", m.masterID),
			zap.Any("message", msg))
		return
	}

	event := &masterEvent{
		Tp: workerStatusUpdatedEvent,
		Handle: &runningHandleImpl{
			workerID:   msg.Worker,
			executorID: entry.executorID,
			manager:    m,
		},
		WorkerID: msg.Worker,
		beforeHook: func() bool {
			if entry.IsTombstone() {
				// Cancel the event.
				return false
			}
			entry.UpdateStatus(msg.Status)
			return true
		},
	}

	if err := m.enqueueEvent(event); err != nil {
		m.errCenter.OnError(err)
		return
	}
}
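
// exampleRouteWorkerMessages is a minimal, hypothetical sketch of how the two
// p2p entry points of WorkerManager could be fed. It assumes the message
// handlers have already been registered elsewhere and the payloads have been
// decoded; registration and decoding are outside the scope of this file.
func exampleRouteWorkerMessages(
	m *WorkerManager,
	heartbeat *frameModel.HeartbeatPingMessage,
	fromNode p2p.NodeID,
	statusUpdate *statusutil.WorkerStatusMessage,
) {
	// Heartbeats refresh the worker's expire time and drive the entry's
	// online/offline state machine.
	m.HandleHeartbeat(heartbeat, fromNode)

	// Status updates are converted into workerStatusUpdatedEvent and delivered
	// to the job master's callback on the next Tick.
	m.OnWorkerStatusUpdateMessage(statusUpdate)
}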

// GetWorkers gets all workers maintained by WorkerManager, including both running
// workers and dead workers.
func (m *WorkerManager) GetWorkers() map[frameModel.WorkerID]WorkerHandle {
	m.mu.Lock()
	defer m.mu.Unlock()

	ret := make(map[frameModel.WorkerID]WorkerHandle, len(m.workerEntries))
	for workerID, entry := range m.workerEntries {
		if entry.IsTombstone() {
			ret[workerID] = &tombstoneHandleImpl{
				workerID: workerID,
				manager:  m,
			}
			continue
		}

		ret[workerID] = &runningHandleImpl{
			workerID:   workerID,
			executorID: entry.executorID,
			manager:    m,
		}
	}
	return ret
}

// IsInitialized returns true after the worker manager has checked whether all
// previously recorded workers are online or dead.
func (m *WorkerManager) IsInitialized() bool {
	m.mu.Lock()
	defer m.mu.Unlock()

	return m.state == workerManagerReady
}

// WithLogger passes a logger.
func (m *WorkerManager) WithLogger(logger *zap.Logger) *WorkerManager {
	m.logger = logger
	return m
}

func (m *WorkerManager) checkWorkerEntriesOnce() error {
	m.mu.Lock()
	defer m.mu.Unlock()

	if m.state != workerManagerReady {
		// We should not check for timeouts during the waiting period,
		// because timeouts during the waiting period are handled inside
		// InitAfterRecover.
		return nil
	}

	for workerID, entry := range m.workerEntries {
		entry := entry
		state := entry.State()
		if state == workerEntryOffline || state == workerEntryTombstone {
			// Prevent repeated delivery of the workerOffline event.
			continue
		}

		hasTimedOut := entry.ExpireTime().Before(m.clock.Now())
		shouldGoOffline := hasTimedOut || entry.IsFinished()
		if !shouldGoOffline {
			continue
		}

		// The worker has timed out, or the master has received a heartbeat
		// from it with IsFinished == true.
		entry.MarkAsOffline()

		var offlineError error
		if status := entry.Status(); status != nil {
			switch status.State {
			case frameModel.WorkerStateFinished:
				offlineError = errors.ErrWorkerFinish.FastGenByArgs()
			case frameModel.WorkerStateStopped:
				offlineError = errors.ErrWorkerCancel.FastGenByArgs()
			case frameModel.WorkerStateError:
				offlineError = errors.ErrWorkerFailed.FastGenByArgs()
			default:
				offlineError = errors.ErrWorkerOffline.FastGenByArgs(workerID)
			}
		}

		err := m.enqueueEvent(&masterEvent{
			Tp:       workerOfflineEvent,
			WorkerID: workerID,
			Handle: &tombstoneHandleImpl{
				workerID: workerID,
				manager:  m,
			},
			Err: offlineError,
			beforeHook: func() bool {
				entry.MarkAsTombstone()
				return true
			},
		})
		if err != nil {
			return err
		}
	}
	return nil
}

func (m *WorkerManager) runBackgroundChecker() error {
	ticker := m.clock.Ticker(m.timeouts.MasterHeartbeatCheckLoopInterval)
	defer ticker.Stop()

	for {
		select {
		case <-m.closeCh:
			m.logger.Info("timeout checker exited", zap.String("master-id", m.masterID))
			return nil
		case <-ticker.C:
			if err := m.checkWorkerEntriesOnce(); err != nil {
				return err
			}
		}
	}
}

func (m *WorkerManager) nextExpireTime() time.Time {
	timeoutInterval := m.timeouts.WorkerTimeoutDuration + m.timeouts.WorkerTimeoutGracefulDuration
	return m.clock.Now().Add(timeoutInterval)
}

func (m *WorkerManager) checkMasterEpochMatch(msgEpoch frameModel.Epoch) (ok bool) {
	if msgEpoch > m.epoch {
		// If there is a worker reporting to a master with a larger epoch, then
		// we shouldn't be running.
		// TODO We need to do some chaos testing to determine whether and how to
		// handle this situation.
		m.logger.Panic("We are a stale master still running",
			zap.String("master-id", m.masterID),
			zap.Int64("msg-epoch", msgEpoch),
			zap.Int64("own-epoch", m.epoch))
	}

	if msgEpoch < m.epoch {
		m.logger.Info("Message from smaller epoch dropped",
			zap.String("master-id", m.masterID),
			zap.Int64("msg-epoch", msgEpoch),
			zap.Int64("own-epoch", m.epoch))
		return false
	}
	return true
}

func (m *WorkerManager) checkWorkerEpochMatch(curEpoch, msgEpoch frameModel.Epoch) bool {
	if msgEpoch > curEpoch {
		m.logger.Panic("We are a stale master still running",
			zap.String("master-id", m.masterID), zap.Int64("own-epoch", m.epoch),
			zap.Int64("own-worker-epoch", curEpoch),
			zap.Int64("msg-worker-epoch", msgEpoch),
		)
	}
	if msgEpoch < curEpoch {
		m.logger.Info("Message from smaller worker epoch dropped",
			zap.String("master-id", m.masterID),
			zap.Int64("own-worker-epoch", curEpoch),
			zap.Int64("msg-worker-epoch", msgEpoch),
		)
		return false
	}
	return true
}

func (m *WorkerManager) enqueueEvent(event *masterEvent) error {
	timer := time.NewTimer(1 * time.Second)
	defer timer.Stop()

	select {
	case <-timer.C:
		return errors.ErrMasterTooManyPendingEvents.GenWithStackByArgs()
	case m.eventQueue <- event:
	}

	return nil
}
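
// exampleIsWorkerTimedOut is a minimal, illustrative sketch of the timeout rule
// enforced by nextExpireTime and checkWorkerEntriesOnce: a worker is treated as
// offline once WorkerTimeoutDuration plus WorkerTimeoutGracefulDuration have
// elapsed since its deadline was last refreshed by a heartbeat. The lastRefresh
// parameter is a stand-in for the moment SetExpireTime was last called; it is
// an assumption for illustration and not tracked under that name in the package.
func exampleIsWorkerTimedOut(lastRefresh, now time.Time, timeouts config.TimeoutConfig) bool {
	deadline := lastRefresh.Add(timeouts.WorkerTimeoutDuration + timeouts.WorkerTimeoutGracefulDuration)
	return deadline.Before(now)
}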

// removeTombstoneEntry removes a tombstone workerEntry from the in-memory map.
// NOTE: removeTombstoneEntry is expected to be used by tombstoneHandleImpl only,
// and it should NOT be called while m.mu is held.
func (m *WorkerManager) removeTombstoneEntry(id frameModel.WorkerID) {
	m.mu.Lock()
	defer m.mu.Unlock()

	// Check the precondition.
	entry, exists := m.workerEntries[id]
	if !exists {
		// Return here. We intend this method to be idempotent.
		return
	}

	if !entry.IsTombstone() {
		m.logger.Panic("Unreachable: not a tombstone", zap.Stringer("entry", entry))
	}

	delete(m.workerEntries, id)
}