github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/engine/framework/internal/master/worker_manager_test.go (about)

     1  // Copyright 2022 PingCAP, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // See the License for the specific language governing permissions and
    12  // limitations under the License.
    13  
    14  package master
    15  
    16  import (
    17  	"context"
    18  	"sync"
    19  	"testing"
    20  	"time"
    21  
    22  	"github.com/pingcap/log"
    23  	"github.com/pingcap/tiflow/engine/framework/config"
    24  	"github.com/pingcap/tiflow/engine/framework/logutil"
    25  	"github.com/pingcap/tiflow/engine/framework/metadata"
    26  	frameModel "github.com/pingcap/tiflow/engine/framework/model"
    27  	"github.com/pingcap/tiflow/engine/framework/statusutil"
    28  	"github.com/pingcap/tiflow/engine/pkg/clock"
    29  	pkgOrm "github.com/pingcap/tiflow/engine/pkg/orm"
    30  	"github.com/pingcap/tiflow/engine/pkg/p2p"
    31  	"github.com/pingcap/tiflow/pkg/errors"
    32  	"github.com/stretchr/testify/require"
    33  	"go.uber.org/zap"
    34  	"golang.org/x/time/rate"
    35  )
    36  
// workerManageTestSuite bundles a WorkerManager under test together with the
// mocked dependencies (metastore client, message sender, clock) it is wired
// to, and records the callback events the manager fires.
type workerManageTestSuite struct {
	// manager is the WorkerManager under test.
	manager       *WorkerManager
	masterNode    p2p.NodeID
	meta          pkgOrm.Client
	messageSender p2p.MessageSender
	// clock is a mock clock; tests advance it explicitly to trigger timeouts.
	clock         *clock.Mock

	// events records the most recent callback event per worker ID; the
	// callbacks reject a second event while one is still pending.
	events map[frameModel.WorkerID]*masterEvent
}
    46  
// AdvanceClockBy moves the mock clock forward by duration, allowing any
// timer-driven logic (such as heartbeat timeouts) to fire.
func (s *workerManageTestSuite) AdvanceClockBy(duration time.Duration) {
	s.clock.Add(duration)
}
    50  
    51  func (s *workerManageTestSuite) SimulateHeartbeat(
    52  	workerID frameModel.WorkerID,
    53  	epoch frameModel.Epoch, workerEpoch frameModel.Epoch,
    54  	node p2p.NodeID, isFinished bool,
    55  ) {
    56  	s.manager.HandleHeartbeat(&frameModel.HeartbeatPingMessage{
    57  		SendTime:     s.clock.Mono(),
    58  		FromWorkerID: workerID,
    59  		Epoch:        epoch,
    60  		WorkerEpoch:  workerEpoch,
    61  		IsFinished:   isFinished,
    62  	}, node)
    63  }
    64  
    65  func (s *workerManageTestSuite) SimulateWorkerUpdateStatus(
    66  	workerID frameModel.WorkerID, status *frameModel.WorkerStatus, epoch frameModel.Epoch,
    67  ) error {
    68  	err := s.meta.UpsertWorker(context.Background(), status)
    69  	if err != nil {
    70  		return err
    71  	}
    72  
    73  	s.manager.OnWorkerStatusUpdateMessage(&statusutil.WorkerStatusMessage{
    74  		Worker:      workerID,
    75  		MasterEpoch: epoch,
    76  		Status:      status,
    77  	})
    78  	return nil
    79  }
    80  
// PutMeta writes a worker status record directly into the metastore,
// simulating state left behind before a master failover.
// The JobID is fixed to "master-1", which matches the master ID used by
// NewWorkerManageTestSuite.
func (s *workerManageTestSuite) PutMeta(workerID frameModel.WorkerID, status *frameModel.WorkerStatus) error {
	status.JobID = "master-1"
	status.ID = workerID
	return s.meta.UpsertWorker(context.Background(), status)
}
    86  
    87  func (s *workerManageTestSuite) onWorkerOnline(ctx context.Context, handle WorkerHandle) error {
    88  	if event, exists := s.events[handle.ID()]; exists {
    89  		log.Warn("found unexpected event", zap.Any("event", event))
    90  		return errors.New("unexpected event already exists")
    91  	}
    92  	s.events[handle.ID()] = &masterEvent{
    93  		Tp:     workerOnlineEvent,
    94  		Handle: handle,
    95  	}
    96  	return nil
    97  }
    98  
    99  func (s *workerManageTestSuite) onWorkerOffline(ctx context.Context, handle WorkerHandle, err error) error {
   100  	if event, exists := s.events[handle.ID()]; exists {
   101  		log.Warn("found unexpected event", zap.Any("event", event))
   102  		return errors.New("unexpected event already exists")
   103  	}
   104  	s.events[handle.ID()] = &masterEvent{
   105  		Tp:     workerOfflineEvent,
   106  		Handle: handle,
   107  		Err:    err,
   108  	}
   109  	return nil
   110  }
   111  
   112  func (s *workerManageTestSuite) onWorkerStatusUpdated(ctx context.Context, handle WorkerHandle) error {
   113  	if event, exists := s.events[handle.ID()]; exists {
   114  		log.Warn("found unexpected event", zap.Any("event", event))
   115  		return errors.New("unexpected event already exists")
   116  	}
   117  	s.events[handle.ID()] = &masterEvent{
   118  		Tp:     workerStatusUpdatedEvent,
   119  		Handle: handle,
   120  	}
   121  	return nil
   122  }
   123  
   124  func (s *workerManageTestSuite) onWorkerDispatched(ctx context.Context, handle WorkerHandle, err error) error {
   125  	if event, exists := s.events[handle.ID()]; exists {
   126  		log.Warn("found unexpected event", zap.Any("event", event))
   127  		return errors.New("unexpected event already exists")
   128  	}
   129  	s.events[handle.ID()] = &masterEvent{
   130  		Tp:     workerDispatchFailedEvent,
   131  		Handle: handle,
   132  		Err:    err,
   133  	}
   134  	return nil
   135  }
   136  
   137  func (s *workerManageTestSuite) WaitForEvent(t *testing.T, workerID frameModel.WorkerID) *masterEvent {
   138  	timeoutCtx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
   139  	defer cancel()
   140  
   141  	rl := rate.NewLimiter(rate.Every(10*time.Millisecond), 1)
   142  
   143  	for {
   144  		select {
   145  		case <-timeoutCtx.Done():
   146  			t.Fatalf("waitForEventTimed out, workerID: %s", workerID)
   147  		default:
   148  		}
   149  
   150  		// The Tick should return very quickly.
   151  		tickCtx, cancel := context.WithTimeout(timeoutCtx, 100*time.Millisecond)
   152  		err := s.manager.Tick(tickCtx)
   153  		cancel()
   154  		require.NoError(t, err)
   155  
   156  		event, exists := s.events[workerID]
   157  		if !exists {
   158  			err := rl.Wait(timeoutCtx)
   159  			require.NoError(t, err)
   160  
   161  			s.AdvanceClockBy(1 * time.Second)
   162  			continue
   163  		}
   164  
   165  		require.Equal(t, workerID, event.Handle.ID())
   166  		delete(s.events, workerID)
   167  		return event
   168  	}
   169  }
   170  
   171  func (s *workerManageTestSuite) AssertNoEvents(t *testing.T, workerID frameModel.WorkerID, waitFor time.Duration) {
   172  	timeoutCtx, cancel := context.WithTimeout(context.Background(), waitFor)
   173  	defer cancel()
   174  
   175  	rl := rate.NewLimiter(rate.Every(10*time.Millisecond), 1)
   176  
   177  	for {
   178  		select {
   179  		case <-timeoutCtx.Done():
   180  			return
   181  		default:
   182  		}
   183  
   184  		// The Tick should return very quickly.
   185  		tickCtx, cancel := context.WithTimeout(timeoutCtx, 100*time.Millisecond)
   186  		err := s.manager.Tick(tickCtx)
   187  		cancel()
   188  		if err != nil {
   189  			if context.DeadlineExceeded == errors.Cause(err) {
   190  				return
   191  			}
   192  			require.NoError(t, err)
   193  		}
   194  
   195  		_, exists := s.events[workerID]
   196  		require.False(t, exists)
   197  
   198  		_ = rl.Wait(timeoutCtx)
   199  	}
   200  }
   201  
// Close shuts down the WorkerManager and releases the mock metastore client.
// Every test must call it before returning.
func (s *workerManageTestSuite) Close() {
	s.manager.Close()
	// Prevents SQL connection leak.
	_ = s.meta.Close()
}
   207  
// NewWorkerManageTestSuite builds a test fixture around a WorkerManager wired
// to a mock metastore client, a mock message sender, and a mock clock.
// isInit selects whether the manager starts as a freshly initialized master
// (true) or as one that must recover pre-existing workers via
// InitAfterRecover (false). The master ID is fixed to "master-1", matching
// the JobID that PutMeta writes.
func NewWorkerManageTestSuite(isInit bool) *workerManageTestSuite {
	cli, err := pkgOrm.NewMockClient()
	if err != nil {
		panic(err)
	}
	ret := &workerManageTestSuite{
		meta:          cli,
		masterNode:    "executor-0",
		messageSender: p2p.NewMockMessageSender(),
		clock:         clock.NewMock(),
		events:        make(map[frameModel.WorkerID]*masterEvent),
	}
	masterID := "master-1"
	logger := logutil.WithMasterID(log.L(), masterID)
	// The suite's onWorker* methods are installed as the manager's
	// callbacks, so every event the manager fires lands in ret.events.
	manager := NewWorkerManager(
		masterID,
		1,
		ret.meta,
		ret.messageSender,
		ret.onWorkerOnline,
		ret.onWorkerOffline,
		ret.onWorkerStatusUpdated,
		ret.onWorkerDispatched,
		isInit,
		config.DefaultTimeoutConfig(),
		ret.clock).
		WithLogger(logger)
	ret.manager = manager
	return ret
}
   238  
   239  func TestCreateWorkerAndWorkerOnline(t *testing.T) {
   240  	t.Parallel()
   241  
   242  	suite := NewWorkerManageTestSuite(true)
   243  	wEpoch := int64(2)
   244  	suite.manager.BeforeStartingWorker("worker-1", "executor-1", wEpoch)
   245  
   246  	suite.SimulateHeartbeat("worker-1", 1, wEpoch, "executor-1", false)
   247  	suite.SimulateHeartbeat("worker-1", 1, wEpoch, "executor-1", false)
   248  	suite.SimulateHeartbeat("worker-1", 1, wEpoch, "executor-1", false)
   249  
   250  	event := suite.WaitForEvent(t, "worker-1")
   251  	require.Equal(t, workerOnlineEvent, event.Tp)
   252  	suite.Close()
   253  }
   254  
   255  func TestCreateWorkerAndWorkerTimesOut(t *testing.T) {
   256  	t.Parallel()
   257  
   258  	suite := NewWorkerManageTestSuite(true)
   259  	suite.manager.BeforeStartingWorker("worker-1", "executor-1", 2)
   260  	suite.AdvanceClockBy(30 * time.Second)
   261  	suite.AdvanceClockBy(30 * time.Second)
   262  	suite.AdvanceClockBy(30 * time.Second)
   263  
   264  	event := suite.WaitForEvent(t, "worker-1")
   265  	require.Equal(t, workerOfflineEvent, event.Tp)
   266  	require.NotNil(t, event.Handle.GetTombstone())
   267  
   268  	suite.AssertNoEvents(t, "worker-1", 500*time.Millisecond)
   269  	suite.Close()
   270  }
   271  
   272  func TestCreateWorkerPredispatchFailed(t *testing.T) {
   273  	t.Parallel()
   274  
   275  	suite := NewWorkerManageTestSuite(true)
   276  	suite.manager.AbortCreatingWorker("worker-1", errors.New("injected error"))
   277  
   278  	event := suite.WaitForEvent(t, "worker-1")
   279  	require.Equal(t, workerDispatchFailedEvent, event.Tp)
   280  	require.NotNil(t, event.Handle.GetTombstone())
   281  	require.Error(t, event.Err)
   282  	require.Regexp(t, ".*injected error.*", event.Err)
   283  
   284  	suite.AssertNoEvents(t, "worker-1", 500*time.Millisecond)
   285  	suite.Close()
   286  }
   287  
   288  func TestCreateWorkerAndWorkerStatusUpdatedAndTimesOut(t *testing.T) {
   289  	t.Parallel()
   290  
   291  	suite := NewWorkerManageTestSuite(true)
   292  	wEpoch := int64(2)
   293  	suite.manager.BeforeStartingWorker("worker-1", "executor-1", wEpoch)
   294  
   295  	suite.SimulateHeartbeat("worker-1", 1, wEpoch, "executor-1", false)
   296  
   297  	event := suite.WaitForEvent(t, "worker-1")
   298  	require.Equal(t, workerOnlineEvent, event.Tp)
   299  
   300  	err := suite.SimulateWorkerUpdateStatus("worker-1", &frameModel.WorkerStatus{
   301  		State: frameModel.WorkerStateFinished,
   302  	}, 1)
   303  	require.NoError(t, err)
   304  
   305  	event = suite.WaitForEvent(t, "worker-1")
   306  	require.Equal(t, workerStatusUpdatedEvent, event.Tp)
   307  	require.Equal(t, frameModel.WorkerStateFinished, event.Handle.Status().State)
   308  
   309  	suite.AdvanceClockBy(30 * time.Second)
   310  	event = suite.WaitForEvent(t, "worker-1")
   311  	require.Equal(t, workerOfflineEvent, event.Tp)
   312  	require.NotNil(t, event.Handle.GetTombstone())
   313  	require.True(t, errors.Is(event.Err, errors.ErrWorkerFinish))
   314  
   315  	suite.Close()
   316  }
   317  
   318  func TestRecoverAfterFailover(t *testing.T) {
   319  	t.Parallel()
   320  
   321  	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
   322  	defer cancel()
   323  
   324  	suite := NewWorkerManageTestSuite(false)
   325  	err := suite.PutMeta("worker-1", &frameModel.WorkerStatus{
   326  		State: frameModel.WorkerStateNormal,
   327  		Epoch: 11,
   328  	})
   329  	require.NoError(t, err)
   330  	err = suite.PutMeta("worker-2", &frameModel.WorkerStatus{
   331  		State: frameModel.WorkerStateNormal,
   332  		Epoch: 12,
   333  	})
   334  	require.NoError(t, err)
   335  	err = suite.PutMeta("worker-3", &frameModel.WorkerStatus{
   336  		State: frameModel.WorkerStateNormal,
   337  		Epoch: 13,
   338  	})
   339  	require.NoError(t, err)
   340  	err = suite.PutMeta("worker-4", &frameModel.WorkerStatus{
   341  		State: frameModel.WorkerStateNormal,
   342  		Epoch: 14,
   343  	})
   344  	require.NoError(t, err)
   345  
   346  	doneCh := make(chan struct{})
   347  	go func() {
   348  		defer close(doneCh)
   349  		err := suite.manager.InitAfterRecover(ctx)
   350  		require.NoError(t, err)
   351  	}()
   352  
   353  	require.Eventually(t, func() bool {
   354  		suite.SimulateHeartbeat("worker-1", 1, 11, "executor-1", false)
   355  		suite.SimulateHeartbeat("worker-2", 1, 12, "executor-2", false)
   356  		suite.SimulateHeartbeat("worker-3", 1, 13, "executor-3", false)
   357  
   358  		select {
   359  		case <-doneCh:
   360  			return true
   361  		default:
   362  		}
   363  		suite.AdvanceClockBy(1 * time.Second)
   364  		return false
   365  	}, 5*time.Second, 10*time.Millisecond)
   366  
   367  	require.True(t, suite.manager.IsInitialized())
   368  	require.Len(t, suite.manager.GetWorkers(), 4)
   369  	require.Contains(t, suite.manager.GetWorkers(), "worker-1")
   370  	require.Contains(t, suite.manager.GetWorkers(), "worker-2")
   371  	require.Contains(t, suite.manager.GetWorkers(), "worker-3")
   372  	require.Contains(t, suite.manager.GetWorkers(), "worker-4")
   373  	require.Nil(t, suite.manager.GetWorkers()["worker-1"].GetTombstone())
   374  	require.Nil(t, suite.manager.GetWorkers()["worker-2"].GetTombstone())
   375  	require.Nil(t, suite.manager.GetWorkers()["worker-3"].GetTombstone())
   376  	require.NotNil(t, suite.manager.GetWorkers()["worker-4"].GetTombstone())
   377  	suite.Close()
   378  }
   379  
   380  func TestRecoverAfterFailoverFast(t *testing.T) {
   381  	t.Parallel()
   382  
   383  	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
   384  	defer cancel()
   385  
   386  	suite := NewWorkerManageTestSuite(false)
   387  	wEpoch := int64(100)
   388  	err := suite.PutMeta("worker-1", &frameModel.WorkerStatus{
   389  		State: frameModel.WorkerStateNormal,
   390  		Epoch: wEpoch,
   391  	})
   392  	require.NoError(t, err)
   393  
   394  	doneCh := make(chan struct{})
   395  	go func() {
   396  		defer close(doneCh)
   397  		err := suite.manager.InitAfterRecover(ctx)
   398  		require.NoError(t, err)
   399  	}()
   400  
   401  	require.Eventually(t, func() bool {
   402  		suite.SimulateHeartbeat("worker-1", 1, wEpoch, "executor-1", false)
   403  		select {
   404  		case <-doneCh:
   405  			return true
   406  		default:
   407  		}
   408  		return false
   409  	}, 1*time.Second, 10*time.Millisecond)
   410  
   411  	require.True(t, suite.manager.IsInitialized())
   412  	require.Len(t, suite.manager.GetWorkers(), 1)
   413  	require.Contains(t, suite.manager.GetWorkers(), "worker-1")
   414  	suite.Close()
   415  }
   416  
   417  func TestRecoverWithNoWorker(t *testing.T) {
   418  	t.Parallel()
   419  
   420  	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
   421  	defer cancel()
   422  
   423  	suite := NewWorkerManageTestSuite(false)
   424  
   425  	// Since there is no worker info in the metastore,
   426  	// recovering should be very fast.
   427  	// Since we are using a mock clock, and we are NOT advancing it,
   428  	// InitAfterRecover returning at all would indicate a successful test.
   429  	err := suite.manager.InitAfterRecover(ctx)
   430  	require.NoError(t, err)
   431  
   432  	suite.Close()
   433  }
   434  
   435  func TestCleanTombstone(t *testing.T) {
   436  	t.Parallel()
   437  
   438  	ctx := context.Background()
   439  
   440  	suite := NewWorkerManageTestSuite(true)
   441  	suite.manager.BeforeStartingWorker("worker-1", "executor-1", 2)
   442  	suite.AdvanceClockBy(30 * time.Second)
   443  	suite.AdvanceClockBy(30 * time.Second)
   444  	suite.AdvanceClockBy(30 * time.Second)
   445  
   446  	event := suite.WaitForEvent(t, "worker-1")
   447  	require.Equal(t, workerOfflineEvent, event.Tp)
   448  	require.NotNil(t, event.Handle.GetTombstone())
   449  	err := event.Handle.GetTombstone().CleanTombstone(ctx)
   450  	require.NoError(t, err)
   451  
   452  	workerMetaClient := metadata.NewWorkerStatusClient("master-1", suite.meta)
   453  	_, err = workerMetaClient.Load(ctx, "worker-1")
   454  	// Asserts that the meta for the worker is indeed deleted.
   455  	require.Error(t, err)
   456  	require.Regexp(t, ".*ErrMetaEntryNotFound", err)
   457  
   458  	// CleanTombstone should be idempotent for robustness.
   459  	err = event.Handle.GetTombstone().CleanTombstone(ctx)
   460  	require.NoError(t, err)
   461  
   462  	// Recreating a worker with the same name should work fine.
   463  	suite.manager.BeforeStartingWorker("worker-1", "executor-1", 10)
   464  
   465  	suite.Close()
   466  }
   467  
   468  func TestWorkerGracefulExit(t *testing.T) {
   469  	t.Parallel()
   470  
   471  	suite := NewWorkerManageTestSuite(true)
   472  	wEpoch := int64(2)
   473  	suite.manager.BeforeStartingWorker("worker-1", "executor-1", wEpoch)
   474  
   475  	suite.SimulateHeartbeat("worker-1", 1, wEpoch, "executor-1", false)
   476  	suite.SimulateHeartbeat("worker-1", 1, wEpoch, "executor-1", false)
   477  	suite.SimulateHeartbeat("worker-1", 1, wEpoch, "executor-1", false)
   478  
   479  	event := suite.WaitForEvent(t, "worker-1")
   480  	require.Equal(t, workerOnlineEvent, event.Tp)
   481  
   482  	suite.SimulateHeartbeat("worker-1", 1, wEpoch, "executor-1", true)
   483  	event = suite.WaitForEvent(t, "worker-1")
   484  	require.Equal(t, workerOfflineEvent, event.Tp)
   485  
   486  	suite.Close()
   487  }
   488  
   489  func TestWorkerGracefulExitOnFirstHeartbeat(t *testing.T) {
   490  	t.Parallel()
   491  
   492  	suite := NewWorkerManageTestSuite(true)
   493  	wEpoch := int64(2)
   494  	suite.manager.BeforeStartingWorker("worker-1", "executor-1", wEpoch)
   495  
   496  	suite.SimulateHeartbeat("worker-1", 1, wEpoch, "executor-1", true)
   497  
   498  	// Now we expect there to be both workerOnlineEvent and workerOfflineEvent,
   499  	// in that order.
   500  	event := suite.WaitForEvent(t, "worker-1")
   501  	require.Equal(t, workerOnlineEvent, event.Tp)
   502  	event = suite.WaitForEvent(t, "worker-1")
   503  	require.Equal(t, workerOfflineEvent, event.Tp)
   504  
   505  	suite.Close()
   506  }
   507  
   508  func TestWorkerGracefulExitAfterFailover(t *testing.T) {
   509  	t.Parallel()
   510  
   511  	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
   512  	defer cancel()
   513  
   514  	suite := NewWorkerManageTestSuite(false)
   515  	wEpoch := int64(2)
   516  	err := suite.PutMeta("worker-1", &frameModel.WorkerStatus{
   517  		State: frameModel.WorkerStateNormal,
   518  		Epoch: wEpoch,
   519  	})
   520  	require.NoError(t, err)
   521  
   522  	doneCh := make(chan struct{})
   523  	go func() {
   524  		defer close(doneCh)
   525  		err := suite.manager.InitAfterRecover(ctx)
   526  		require.NoError(t, err)
   527  	}()
   528  
   529  	require.Eventually(t, func() bool {
   530  		suite.SimulateHeartbeat("worker-1", 1, wEpoch, "executor-1", true)
   531  		select {
   532  		case <-doneCh:
   533  			return true
   534  		default:
   535  		}
   536  		suite.AdvanceClockBy(1 * time.Second)
   537  		return false
   538  	}, 1*time.Second, 10*time.Millisecond)
   539  
   540  	require.True(t, suite.manager.IsInitialized())
   541  	require.Len(t, suite.manager.GetWorkers(), 1)
   542  	require.Contains(t, suite.manager.GetWorkers(), "worker-1")
   543  	require.NotNil(t, suite.manager.GetWorkers()["worker-1"].GetTombstone())
   544  	suite.Close()
   545  }
   546  
   547  func TestWorkerSendsStaleHeartbeat(t *testing.T) {
   548  	t.Parallel()
   549  
   550  	suite := NewWorkerManageTestSuite(true)
   551  	wEpoch := int64(2)
   552  	suite.manager.BeforeStartingWorker("worker-1", "executor-1", wEpoch)
   553  
   554  	suite.SimulateHeartbeat("worker-1", 1, wEpoch, "executor-1", false)
   555  	suite.SimulateHeartbeat("worker-1", 1, wEpoch, "executor-1", false)
   556  
   557  	event := suite.WaitForEvent(t, "worker-1")
   558  	require.Equal(t, workerOnlineEvent, event.Tp)
   559  
   560  	ctx, cancel := context.WithCancel(context.Background())
   561  	var wg sync.WaitGroup
   562  	wg.Add(1)
   563  	go func() {
   564  		defer wg.Done()
   565  		for {
   566  			select {
   567  			case <-ctx.Done():
   568  				return
   569  			case <-time.After(time.Millisecond * 20):
   570  				suite.SimulateHeartbeat("worker-1", 1, wEpoch-1, "executor-1", false)
   571  			}
   572  		}
   573  	}()
   574  
   575  	event = suite.WaitForEvent(t, "worker-1")
   576  	require.Equal(t, workerOfflineEvent, event.Tp)
   577  
   578  	suite.Close()
   579  	cancel()
   580  	wg.Wait()
   581  }