github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/engine/framework/internal/master/worker_manager.go

     1  // Copyright 2022 PingCAP, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // See the License for the specific language governing permissions and
    12  // limitations under the License.
    13  
    14  package master
    15  
    16  import (
    17  	"context"
    18  	"sync"
    19  	"time"
    20  
    21  	"github.com/pingcap/tiflow/engine/framework/config"
    22  	"github.com/pingcap/tiflow/engine/framework/metadata"
    23  	frameModel "github.com/pingcap/tiflow/engine/framework/model"
    24  	"github.com/pingcap/tiflow/engine/framework/statusutil"
    25  	"github.com/pingcap/tiflow/engine/model"
    26  	"github.com/pingcap/tiflow/engine/pkg/clock"
    27  	"github.com/pingcap/tiflow/engine/pkg/errctx"
    28  	pkgOrm "github.com/pingcap/tiflow/engine/pkg/orm"
    29  	"github.com/pingcap/tiflow/engine/pkg/p2p"
    30  	"github.com/pingcap/tiflow/pkg/errors"
    31  	"go.uber.org/zap"
    32  )
    33  
    34  type (
    35  	// Callback is an alias for a worker callback function that is invoked without an accompanying error.
    36  	Callback = func(ctx context.Context, handle WorkerHandle) error
    37  	// CallbackWithError is an alias for a worker callback function that is invoked together with an error.
    38  	CallbackWithError = func(ctx context.Context, handle WorkerHandle, err error) error
    39  )
    40  
    41  // WorkerManager manages all workers belonging to a job master
    42  type WorkerManager struct {
    43  	mu            sync.Mutex
    44  	workerEntries map[frameModel.WorkerID]*workerEntry
    45  	state         workerManagerState
    46  
    47  	workerMetaClient *metadata.WorkerStatusClient
    48  	messageSender    p2p.MessageSender
    49  
    50  	masterID frameModel.MasterID
    51  	epoch    frameModel.Epoch
    52  
    53  	onWorkerOnlined       Callback
    54  	onWorkerOfflined      CallbackWithError
    55  	onWorkerStatusUpdated Callback
    56  	onWorkerDispatched    CallbackWithError
    57  
    58  	eventQueue chan *masterEvent
    59  	closeCh    chan struct{}
    60  	errCenter  *errctx.ErrCenter
    61  	// allWorkersReady is **closed** when a heartbeat has been received
    62  	// from all workers recorded in meta.
    63  	allWorkersReady chan struct{}
    64  	logger          *zap.Logger
    65  
    66  	clock clock.Clock
    67  
    68  	timeouts config.TimeoutConfig
    69  
    70  	wg sync.WaitGroup
    71  }
    72  
    73  type workerManagerState int32
    74  
    75  const (
    76  	workerManagerReady = workerManagerState(iota + 1)
    77  	workerManagerLoadingMeta
    78  	workerManagerWaitingHeartbeat
    79  )
    80  
    81  // NewWorkerManager creates a new WorkerManager instance
    82  func NewWorkerManager(
    83  	masterID frameModel.MasterID,
    84  	epoch frameModel.Epoch,
    85  	meta pkgOrm.Client,
    86  	messageSender p2p.MessageSender,
    87  	onWorkerOnline Callback,
    88  	onWorkerOffline CallbackWithError,
    89  	onWorkerStatusUpdated Callback,
    90  	onWorkerDispatched CallbackWithError,
    91  	isInit bool,
    92  	timeoutConfig config.TimeoutConfig,
    93  	clock clock.Clock,
    94  ) *WorkerManager {
    95  	state := workerManagerReady
    96  	if !isInit {
    97  		state = workerManagerLoadingMeta
    98  	}
    99  
   100  	ret := &WorkerManager{
   101  		workerEntries: make(map[frameModel.WorkerID]*workerEntry),
   102  		state:         state,
   103  
   104  		workerMetaClient: metadata.NewWorkerStatusClient(masterID, meta),
   105  		messageSender:    messageSender,
   106  
   107  		masterID: masterID,
   108  		epoch:    epoch,
   109  
   110  		onWorkerOnlined:       onWorkerOnline,
   111  		onWorkerOfflined:      onWorkerOffline,
   112  		onWorkerStatusUpdated: onWorkerStatusUpdated,
   113  		onWorkerDispatched:    onWorkerDispatched,
   114  
   115  		eventQueue:      make(chan *masterEvent, 1024),
   116  		closeCh:         make(chan struct{}),
   117  		errCenter:       errctx.NewErrCenter(),
   118  		allWorkersReady: make(chan struct{}),
   119  
   120  		clock:    clock,
   121  		timeouts: timeoutConfig,
   122  	}
   123  
   124  	ret.wg.Add(1)
   125  	go func() {
   126  		defer ret.wg.Done()
   127  		if err := ret.runBackgroundChecker(); err != nil {
   128  			ret.errCenter.OnError(err)
   129  		}
   130  	}()
   131  
   132  	return ret
   133  }
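
        // The construction sequence below is only an illustrative sketch (it is not
        // part of the original file). It assumes the caller already has a masterID,
        // epoch, pkgOrm.Client, p2p.MessageSender, timeout config, clock and logger
        // in scope, and it elides the callback bodies:
        //
        //	wm := NewWorkerManager(
        //		masterID, epoch, metaClient, msgSender,
        //		func(ctx context.Context, h WorkerHandle) error { return nil },            // onWorkerOnline
        //		func(ctx context.Context, h WorkerHandle, err error) error { return nil }, // onWorkerOffline
        //		func(ctx context.Context, h WorkerHandle) error { return nil },            // onWorkerStatusUpdated
        //		func(ctx context.Context, h WorkerHandle, err error) error { return nil }, // onWorkerDispatched
        //		true /* isInit: a fresh job, not a failover */, timeouts, clk,
        //	).WithLogger(logger)
        //	defer wm.Close()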
   134  
   135  // Close closes the WorkerManager and waits until all resources are released.
   136  func (m *WorkerManager) Close() {
   137  	close(m.closeCh)
   138  	m.wg.Wait()
   139  }
   140  
   141  // InitAfterRecover should be called after the master has failed over. It blocks
   142  // until all recovered workers have sent heartbeats or the heartbeat timeout has passed.
   143  func (m *WorkerManager) InitAfterRecover(ctx context.Context) (retErr error) {
   144  	defer func() {
   145  		if retErr != nil {
   146  			m.errCenter.OnError(retErr)
   147  		}
   148  	}()
   149  
   150  	ctx, cancel := m.errCenter.WithCancelOnFirstError(ctx)
   151  	defer cancel()
   152  
   153  	m.mu.Lock()
   154  	if m.state != workerManagerLoadingMeta {
   155  		// InitAfterRecover should only be called if
   156  		// NewWorkerManager has been called with isInit as false.
   157  		m.logger.Panic("Unreachable", zap.String("master-id", m.masterID))
   158  	}
   159  
   160  	// Unlock here because loading meta involves I/O, which can be long.
   161  	m.mu.Unlock()
   162  
   163  	allPersistedWorkers, err := m.workerMetaClient.LoadAllWorkers(ctx)
   164  	if err != nil {
   165  		return err
   166  	}
   167  
   168  	m.mu.Lock()
   169  	for workerID, status := range allPersistedWorkers {
   170  		entry := newWaitingWorkerEntry(workerID, status)
   171  		// TODO: refine mapping from worker status to worker entry state
   172  		if status.State == frameModel.WorkerStateFinished {
   173  			continue
   174  		}
   175  		m.workerEntries[workerID] = entry
   176  	}
   177  
   178  	if len(m.workerEntries) == 0 {
   179  		// Fast path when there is no active worker.
   180  		m.state = workerManagerReady
   181  		m.mu.Unlock()
   182  		return nil
   183  	}
   184  
   185  	m.state = workerManagerWaitingHeartbeat
   186  	m.mu.Unlock()
   187  
   188  	timeoutInterval := m.timeouts.WorkerTimeoutDuration + m.timeouts.WorkerTimeoutGracefulDuration
   189  
   190  	timer := m.clock.Timer(timeoutInterval)
   191  	defer timer.Stop()
   192  
   193  	startTime := m.clock.Now()
   194  	select {
   195  	case <-ctx.Done():
   196  		return errors.Trace(ctx.Err())
   197  	case <-m.allWorkersReady:
   198  		m.logger.Info("All workers have sent heartbeats after master failover. Resuming right now.",
   199  			zap.Duration("duration", m.clock.Since(startTime)))
   200  	case <-timer.C:
   201  		// The heartbeat timeout has expired; workers that stayed silent are marked as tombstones below.
   202  	}
   203  
   204  	m.mu.Lock()
   205  	for _, entry := range m.workerEntries {
   206  		if entry.State() == workerEntryWait || entry.IsFinished() {
   207  			entry.MarkAsTombstone()
   208  		}
   209  	}
   210  	m.state = workerManagerReady
   211  	m.mu.Unlock()
   212  
   213  	return nil
   214  }
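
        // A failover-path sketch (illustrative only, not part of the original file;
        // masterID, newEpoch, metaClient, msgSender, timeouts, clk and the callbacks
        // are assumed to be in scope): when the job master is re-created after a
        // crash, the manager is built with isInit == false and must be driven through
        // InitAfterRecover before it is considered ready:
        //
        //	wm := NewWorkerManager(masterID, newEpoch, metaClient, msgSender,
        //		onOnline, onOffline, onStatusUpdated, onDispatched,
        //		false /* isInit: recovering from a previous epoch */, timeouts, clk)
        //	if err := wm.InitAfterRecover(ctx); err != nil {
        //		return err
        //	}
        //	// Every worker recorded in meta is now either online or a tombstone.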
   215  
   216  // HandleHeartbeat handles a heartbeat ping message from a worker
   217  func (m *WorkerManager) HandleHeartbeat(msg *frameModel.HeartbeatPingMessage, fromNode p2p.NodeID) {
   218  	m.mu.Lock()
   219  	defer m.mu.Unlock()
   220  
   221  	if m.state == workerManagerLoadingMeta {
   222  		return
   223  	}
   224  
   225  	if !m.checkMasterEpochMatch(msg.Epoch) {
   226  		return
   227  	}
   228  
   229  	entry, exists := m.workerEntries[msg.FromWorkerID]
   230  	if !exists {
   231  		m.logger.Info("Message from stale worker dropped",
   232  			zap.String("master-id", m.masterID),
   233  			zap.Any("message", msg),
   234  			zap.String("from-node", fromNode))
   235  		return
   236  	}
   237  
   238  	epoch := entry.Status().Epoch
   239  	if !m.checkWorkerEpochMatch(epoch, msg.WorkerEpoch) {
   240  		return
   241  	}
   242  
   243  	if msg.IsFinished {
   244  		entry.SetFinished()
   245  	}
   246  
   247  	entry.SetExpireTime(m.nextExpireTime())
   248  
   249  	if m.state == workerManagerWaitingHeartbeat {
   250  		if entry.State() != workerEntryWait {
   251  			// Duplicate heartbeats are tolerated during the
   252  			// workerManagerWaitingHeartbeat stage; only the first one matters.
   253  			return
   254  		}
   255  
   256  		m.logger.Info("Worker discovered", zap.String("master-id", m.masterID),
   257  			zap.Any("worker-entry", entry))
   258  		entry.MarkAsOnline(model.ExecutorID(fromNode), m.nextExpireTime())
   259  
   260  		allReady := true
   261  		for _, e := range m.workerEntries {
   262  			if e.State() == workerEntryWait {
   263  				allReady = false
   264  				break
   265  			}
   266  		}
   267  		if allReady {
   268  			close(m.allWorkersReady)
   269  			m.logger.Info("All workers have sent heartbeats, sending signal to resume the master",
   270  				zap.String("master-id", m.masterID))
   271  		}
   272  	} else {
   273  		if entry.State() != workerEntryCreated {
   274  			// Return if it is not the first heartbeat.
   275  			return
   276  		}
   277  
   278  		entry.MarkAsOnline(model.ExecutorID(fromNode), m.nextExpireTime())
   279  
   280  		err := m.enqueueEvent(&masterEvent{
   281  			Tp:       workerOnlineEvent,
   282  			WorkerID: msg.FromWorkerID,
   283  			Handle: &runningHandleImpl{
   284  				workerID:   msg.FromWorkerID,
   285  				executorID: model.ExecutorID(fromNode),
   286  				manager:    m,
   287  			},
   288  		})
   289  		if err != nil {
   290  			m.errCenter.OnError(err)
   291  		}
   292  	}
   293  }
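
        // A routing sketch (illustrative; the p2p handler registration API is not
        // shown in this file, so onHeartbeatPing below is a hypothetical wrapper):
        // whatever receives a frameModel.HeartbeatPingMessage simply forwards it
        // together with the sender's node ID:
        //
        //	func onHeartbeatPing(fromNode p2p.NodeID, msg *frameModel.HeartbeatPingMessage) {
        //		wm.HandleHeartbeat(msg, fromNode)
        //	}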
   294  
   295  // Tick should be called by the BaseMaster so that the callbacks can be
   296  // run in the main goroutine.
   297  func (m *WorkerManager) Tick(ctx context.Context) error {
   298  	if err := m.errCenter.CheckError(); err != nil {
   299  		return err
   300  	}
   301  
   302  	ctx, cancel := context.WithTimeout(ctx, 5*time.Second)
   303  	defer cancel()
   304  	ctx, cancel = m.errCenter.WithCancelOnFirstError(ctx)
   305  	defer cancel()
   306  
   307  	for {
   308  		var event *masterEvent
   309  		select {
   310  		case <-ctx.Done():
   311  			return errors.Trace(ctx.Err())
   312  		case event = <-m.eventQueue:
   313  		default:
   314  			return nil
   315  		}
   316  
   317  		if event.beforeHook != nil {
   318  			if ok := event.beforeHook(); !ok {
   319  				// Continue to the next event.
   320  				continue
   321  			}
   322  		}
   323  
   324  		switch event.Tp {
   325  		case workerOnlineEvent:
   326  			if err := m.onWorkerOnlined(ctx, event.Handle); err != nil {
   327  				return err
   328  			}
   329  		case workerOfflineEvent:
   330  			if err := m.onWorkerOfflined(ctx, event.Handle, event.Err); err != nil {
   331  				return err
   332  			}
   333  		case workerStatusUpdatedEvent:
   334  			if err := m.onWorkerStatusUpdated(ctx, event.Handle); err != nil {
   335  				return err
   336  			}
   337  		case workerDispatchFailedEvent:
   338  			if err := m.onWorkerDispatched(ctx, event.Handle, event.Err); err != nil {
   339  				return err
   340  			}
   341  		}
   342  	}
   343  }
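
        // One plausible way to drive the event queue (a sketch, not part of the
        // original file; ctx, tickInterval and wm are assumed to be defined by the
        // caller): the callbacks registered in NewWorkerManager only run when Tick is
        // called, so the embedding master polls it periodically from its main goroutine:
        //
        //	ticker := time.NewTicker(tickInterval)
        //	defer ticker.Stop()
        //	for {
        //		select {
        //		case <-ctx.Done():
        //			return ctx.Err()
        //		case <-ticker.C:
        //			if err := wm.Tick(ctx); err != nil {
        //				return err
        //			}
        //		}
        //	}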
   344  
   345  // BeforeStartingWorker is called by the BaseMaster BEFORE the executor runs the worker,
   346  // but after the executor records the time at which the worker is submitted.
   347  func (m *WorkerManager) BeforeStartingWorker(
   348  	workerID frameModel.WorkerID, executorID model.ExecutorID, epoch frameModel.Epoch,
   349  ) {
   350  	m.mu.Lock()
   351  	defer m.mu.Unlock()
   352  
   353  	if _, exists := m.workerEntries[workerID]; exists {
   354  		m.logger.Panic("worker already exists", zap.String("worker-id", workerID))
   355  	}
   356  
   357  	m.workerEntries[workerID] = newWorkerEntry(
   358  		workerID,
   359  		executorID,
   360  		m.nextExpireTime(),
   361  		workerEntryCreated,
   362  		&frameModel.WorkerStatus{
   363  			State: frameModel.WorkerStateCreated,
   364  			Epoch: epoch,
   365  		},
   366  	)
   367  }
   368  
   369  // AbortCreatingWorker is called by BaseMaster if starting the worker has failed for sure.
   370  // NOTE: If the RPC used to start the worker returns errors such as Canceled or DeadlineExceeded,
   371  // it has NOT failed FOR SURE.
   372  func (m *WorkerManager) AbortCreatingWorker(workerID frameModel.WorkerID, errIn error) {
   373  	m.mu.Lock()
   374  	defer m.mu.Unlock()
   375  
   376  	event := &masterEvent{
   377  		Tp:       workerDispatchFailedEvent,
   378  		WorkerID: workerID,
   379  		Handle: &tombstoneHandleImpl{
   380  			workerID: workerID,
   381  			manager:  m,
   382  		},
   383  		Err: errIn,
   384  		beforeHook: func() bool {
   385  			m.mu.Lock()
   386  			defer m.mu.Unlock()
   387  
   388  			delete(m.workerEntries, workerID)
   389  			return true
   390  		},
   391  	}
   392  
   393  	err := m.enqueueEvent(event)
   394  	if err != nil {
   395  		m.errCenter.OnError(err)
   396  	}
   397  }
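
        // A sketch of the dispatch bookkeeping implied by BeforeStartingWorker and
        // AbortCreatingWorker (illustrative, not part of the original file;
        // workerID, executorID and workerEpoch are assumed to be in scope, and
        // submitWorkerToExecutor and isDefiniteFailure are hypothetical helpers
        // standing in for the real dispatch RPC and its error classification):
        //
        //	wm.BeforeStartingWorker(workerID, executorID, workerEpoch)
        //	if err := submitWorkerToExecutor(ctx, workerID, executorID); err != nil {
        //		if isDefiniteFailure(err) { // NOT context.Canceled or DeadlineExceeded
        //			wm.AbortCreatingWorker(workerID, err)
        //		}
        //		return err
        //	}
        //	// On success the entry stays in workerEntryCreated until the first
        //	// heartbeat arrives and HandleHeartbeat marks it online.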
   398  
   399  // OnWorkerStatusUpdateMessage should be called in the message handler for WorkerStatusMessage.
   400  func (m *WorkerManager) OnWorkerStatusUpdateMessage(msg *statusutil.WorkerStatusMessage) {
   401  	m.mu.Lock()
   402  	defer m.mu.Unlock()
   403  
   404  	if !m.checkMasterEpochMatch(msg.MasterEpoch) {
   405  		return
   406  	}
   407  
   408  	entry, exists := m.workerEntries[msg.Worker]
   409  	if !exists {
   410  		m.logger.Info("WorkerStatusMessage dropped for unknown worker",
   411  			zap.String("master-id", m.masterID),
   412  			zap.Any("message", msg))
   413  		return
   414  	}
   415  
   416  	event := &masterEvent{
   417  		Tp: workerStatusUpdatedEvent,
   418  		Handle: &runningHandleImpl{
   419  			workerID:   msg.Worker,
   420  			executorID: entry.executorID,
   421  			manager:    m,
   422  		},
   423  		WorkerID: msg.Worker,
   424  		beforeHook: func() bool {
   425  			if entry.IsTombstone() {
   426  				// Cancel the event
   427  				return false
   428  			}
   429  			entry.UpdateStatus(msg.Status)
   430  			return true
   431  		},
   432  	}
   433  
   434  	if err := m.enqueueEvent(event); err != nil {
   435  		m.errCenter.OnError(err)
   436  		return
   437  	}
   438  }
   439  
   440  // GetWorkers gets all workers maintained by WorkerManager, including both running
   441  // workers and dead workers.
   442  func (m *WorkerManager) GetWorkers() map[frameModel.WorkerID]WorkerHandle {
   443  	m.mu.Lock()
   444  	defer m.mu.Unlock()
   445  
   446  	ret := make(map[frameModel.WorkerID]WorkerHandle, len(m.workerEntries))
   447  	for workerID, entry := range m.workerEntries {
   448  		if entry.IsTombstone() {
   449  			ret[workerID] = &tombstoneHandleImpl{
   450  				workerID: workerID,
   451  				manager:  m,
   452  			}
   453  			continue
   454  		}
   455  
   456  		ret[workerID] = &runningHandleImpl{
   457  			workerID:   workerID,
   458  			executorID: entry.executorID,
   459  			manager:    m,
   460  		}
   461  	}
   462  	return ret
   463  }
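
        // An iteration sketch (not part of the original file). Both running workers
        // and tombstones appear in the returned map; how to distinguish them depends
        // on the WorkerHandle interface defined elsewhere in the framework, so only
        // the map shape is shown here:
        //
        //	for workerID, handle := range wm.GetWorkers() {
        //		_ = handle // may be a running handle or a tombstone handle
        //		_ = workerID
        //	}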
   464  
   465  // IsInitialized returns true once the worker manager has determined, for every
   466  // recovered worker, whether it is still online or dead (a tombstone).
   467  func (m *WorkerManager) IsInitialized() bool {
   468  	m.mu.Lock()
   469  	defer m.mu.Unlock()
   470  
   471  	return m.state == workerManagerReady
   472  }
   473  
   474  // WithLogger sets the logger used by the WorkerManager and returns the manager for chaining.
   475  func (m *WorkerManager) WithLogger(logger *zap.Logger) *WorkerManager {
   476  	m.logger = logger
   477  	return m
   478  }
   479  
   480  func (m *WorkerManager) checkWorkerEntriesOnce() error {
   481  	m.mu.Lock()
   482  	defer m.mu.Unlock()
   483  
   484  	if m.state != workerManagerReady {
   485  		// We should not check for timeouts during the waiting period,
   486  		// because timeouts during that period are handled inside
   487  		// InitAfterRecover.
   488  		return nil
   489  	}
   490  
   491  	for workerID, entry := range m.workerEntries {
   492  		entry := entry // capture the loop variable for the closure in beforeHook below
   493  		state := entry.State()
   494  		if state == workerEntryOffline || state == workerEntryTombstone {
   495  			// Prevent repeated delivery of the workerOffline event.
   496  			continue
   497  		}
   498  
   499  		hasTimedOut := entry.ExpireTime().Before(m.clock.Now())
   500  		shouldGoOffline := hasTimedOut || entry.IsFinished()
   501  		if !shouldGoOffline {
   502  			continue
   503  		}
   504  
   505  		// The worker has timed out, or the master has received a heartbeat
   506  		// from it with IsFinished == true.
   507  		entry.MarkAsOffline()
   508  
   509  		var offlineError error
   510  		if status := entry.Status(); status != nil {
   511  			switch status.State {
   512  			case frameModel.WorkerStateFinished:
   513  				offlineError = errors.ErrWorkerFinish.FastGenByArgs()
   514  			case frameModel.WorkerStateStopped:
   515  				offlineError = errors.ErrWorkerCancel.FastGenByArgs()
   516  			case frameModel.WorkerStateError:
   517  				offlineError = errors.ErrWorkerFailed.FastGenByArgs()
   518  			default:
   519  				offlineError = errors.ErrWorkerOffline.FastGenByArgs(workerID)
   520  			}
   521  		}
   522  
   523  		err := m.enqueueEvent(&masterEvent{
   524  			Tp:       workerOfflineEvent,
   525  			WorkerID: workerID,
   526  			Handle: &tombstoneHandleImpl{
   527  				workerID: workerID,
   528  				manager:  m,
   529  			},
   530  			Err: offlineError,
   531  			beforeHook: func() bool {
   532  				entry.MarkAsTombstone()
   533  				return true
   534  			},
   535  		})
   536  		if err != nil {
   537  			return err
   538  		}
   539  	}
   540  	return nil
   541  }
   542  
   543  func (m *WorkerManager) runBackgroundChecker() error {
   544  	ticker := m.clock.Ticker(m.timeouts.MasterHeartbeatCheckLoopInterval)
   545  	defer ticker.Stop()
   546  
   547  	for {
   548  		select {
   549  		case <-m.closeCh:
   550  			m.logger.Info("timeout checker exited", zap.String("master-id", m.masterID))
   551  			return nil
   552  		case <-ticker.C:
   553  			if err := m.checkWorkerEntriesOnce(); err != nil {
   554  				return err
   555  			}
   556  		}
   557  	}
   558  }
   559  
   560  func (m *WorkerManager) nextExpireTime() time.Time {
   561  	timeoutInterval := m.timeouts.WorkerTimeoutDuration + m.timeouts.WorkerTimeoutGracefulDuration
   562  	return m.clock.Now().Add(timeoutInterval)
   563  }
   564  
   565  func (m *WorkerManager) checkMasterEpochMatch(msgEpoch frameModel.Epoch) (ok bool) {
   566  	if msgEpoch > m.epoch {
   567  		// If there is a worker reporting to a master with a larger epoch, then
   568  		// we shouldn't be running.
   569  		// TODO We need to do some chaos testing to determine whether and how to
   570  		// handle this situation.
   571  		m.logger.Panic("We are a stale master still running",
   572  			zap.String("master-id", m.masterID),
   573  			zap.Int64("msg-epoch", msgEpoch),
   574  			zap.Int64("own-epoch", m.epoch))
   575  	}
   576  
   577  	if msgEpoch < m.epoch {
   578  		m.logger.Info("Message from smaller epoch dropped",
   579  			zap.String("master-id", m.masterID),
   580  			zap.Int64("msg-epoch", msgEpoch),
   581  			zap.Int64("own-epoch", m.epoch))
   582  		return false
   583  	}
   584  	return true
   585  }
   586  
   587  func (m *WorkerManager) checkWorkerEpochMatch(curEpoch, msgEpoch frameModel.Epoch) bool {
   588  	if msgEpoch > curEpoch {
   589  		m.logger.Panic("We are a stale master still running",
   590  			zap.String("master-id", m.masterID), zap.Int64("own-epoch", m.epoch),
   591  			zap.Int64("own-worker-epoch", curEpoch),
   592  			zap.Int64("msg-worker-epoch", msgEpoch),
   593  		)
   594  	}
   595  	if msgEpoch < curEpoch {
   596  		m.logger.Info("Message from smaller worker epoch dropped",
   597  			zap.String("master-id", m.masterID),
   598  			zap.Int64("own-worker-epoch", curEpoch),
   599  			zap.Int64("msg-worker-epoch", msgEpoch),
   600  		)
   601  		return false
   602  	}
   603  	return true
   604  }
   605  
   606  func (m *WorkerManager) enqueueEvent(event *masterEvent) error {
   607  	timer := time.NewTimer(1 * time.Second)
   608  	defer timer.Stop()
   609  
   610  	select {
   611  	case <-timer.C:
   612  		return errors.ErrMasterTooManyPendingEvents.GenWithStackByArgs()
   613  	case m.eventQueue <- event:
   614  	}
   615  
   616  	return nil
   617  }
   618  
   619  // removeTombstoneEntry removes a tombstone workerEntry from the in-memory map.
   620  // NOTE: removeTombstoneEntry is expected to be used by tombstoneHandleImpl only,
   621  // and it should NOT be called with m.mu taken.
   622  func (m *WorkerManager) removeTombstoneEntry(id frameModel.WorkerID) {
   623  	m.mu.Lock()
   624  	defer m.mu.Unlock()
   625  
   626  	// Checks precondition.
   627  	entry, exists := m.workerEntries[id]
   628  	if !exists {
   629  		// Return here. We intend this method to be idempotent.
   630  		return
   631  	}
   632  
   633  	if !entry.IsTombstone() {
   634  		m.logger.Panic("Unreachable: not a tombstone", zap.Stringer("entry", entry))
   635  	}
   636  
   637  	delete(m.workerEntries, id)
   638  }