github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/engine/servermaster/jobmanager_test.go (about)

     1  // Copyright 2022 PingCAP, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // See the License for the specific language governing permissions and
    12  // limitations under the License.
    13  
    14  package servermaster
    15  
    16  import (
    17  	"context"
    18  	"fmt"
    19  	"math/rand"
    20  	"testing"
    21  	"time"
    22  
    23  	"github.com/golang/mock/gomock"
    24  	pb "github.com/pingcap/tiflow/engine/enginepb"
    25  	"github.com/pingcap/tiflow/engine/framework"
    26  	"github.com/pingcap/tiflow/engine/framework/metadata"
    27  	frameModel "github.com/pingcap/tiflow/engine/framework/model"
    28  	"github.com/pingcap/tiflow/engine/pkg/clock"
    29  	"github.com/pingcap/tiflow/engine/pkg/ctxmu"
    30  	resManager "github.com/pingcap/tiflow/engine/pkg/externalresource/manager"
    31  	jobMock "github.com/pingcap/tiflow/engine/pkg/httputil/mock"
    32  	"github.com/pingcap/tiflow/engine/pkg/notifier"
    33  	"github.com/pingcap/tiflow/engine/pkg/openapi"
    34  	pkgOrm "github.com/pingcap/tiflow/engine/pkg/orm"
    35  	"github.com/pingcap/tiflow/engine/servermaster/jobop"
    36  	jobopMock "github.com/pingcap/tiflow/engine/servermaster/jobop/mock"
    37  	"github.com/pingcap/tiflow/pkg/errors"
    38  	"github.com/pingcap/tiflow/pkg/label"
    39  	"github.com/pingcap/tiflow/pkg/notify"
    40  	"github.com/pingcap/tiflow/pkg/uuid"
    41  	"github.com/stretchr/testify/mock"
    42  	"github.com/stretchr/testify/require"
    43  	"go.uber.org/atomic"
    44  	"golang.org/x/sync/errgroup"
    45  	"google.golang.org/protobuf/proto"
    46  )
    47  
    48  func prepareMockJobManager(
    49  	ctx context.Context, t *testing.T, masterID string,
    50  ) (*framework.MockMasterImpl, *JobManagerImpl) {
    51  	mockMaster := framework.NewMockMasterImpl(t, "", masterID)
    52  	framework.MockMasterPrepareMeta(ctx, t, mockMaster)
    53  	mgr := &JobManagerImpl{
    54  		BaseMaster:          mockMaster.DefaultBaseMaster,
    55  		JobFsm:              NewJobFsm(),
    56  		clocker:             clock.New(),
    57  		uuidGen:             uuid.NewGenerator(),
    58  		frameMetaClient:     mockMaster.GetFrameMetaClient(),
    59  		masterMetaClient:    metadata.NewMasterMetadataClient(metadata.JobManagerUUID, mockMaster.GetFrameMetaClient()),
    60  		jobStatusChangeMu:   ctxmu.New(),
    61  		notifier:            notifier.NewNotifier[resManager.JobStatusChangeEvent](),
    62  		jobOperatorNotifier: new(notify.Notifier),
    63  		jobHTTPClient:       jobMock.NewMockNilReturnJobHTTPClient(),
    64  	}
    65  	return mockMaster, mgr
    66  }
    67  
    68  func TestJobManagerCreateJob(t *testing.T) {
    69  	t.Parallel()
    70  
    71  	ctx, cancel := context.WithCancel(context.Background())
    72  	defer cancel()
    73  
    74  	masterID := "create-job-test"
    75  	mockMaster, mgr := prepareMockJobManager(ctx, t, masterID)
    76  	mockMaster.On("InitImpl", mock.Anything).Return(nil)
    77  	mockMaster.MasterClient().EXPECT().ScheduleTask(
    78  		gomock.Any(),
    79  		gomock.Any()).Return(&pb.ScheduleTaskResponse{}, errors.ErrClusterResourceNotEnough.FastGenByArgs()).Times(1)
    80  	wg, ctx := errgroup.WithContext(ctx)
    81  	mgr.wg = wg
    82  	// set master impl to JobManagerImpl
    83  	mockMaster.Impl = mgr
    84  	err := mockMaster.Init(ctx)
    85  	require.Nil(t, err)
    86  	req := &pb.CreateJobRequest{
    87  		Job: &pb.Job{
    88  			Type:   pb.Job_CVSDemo,
    89  			Config: []byte("{\"srcHost\":\"0.0.0.0:1234\", \"dstHost\":\"0.0.0.0:1234\", \"srcDir\":\"data\", \"dstDir\":\"data1\"}"),
    90  		},
    91  	}
    92  	job, err := mgr.CreateJob(ctx, req)
    93  	require.NoError(t, err)
    94  
    95  	require.Eventually(t, func() bool {
    96  		return mgr.JobFsm.QueryJob(job.Id) != nil
    97  	}, time.Second*2, time.Millisecond*20)
    98  
    99  	// Create a new job with the same id.
   100  	req = &pb.CreateJobRequest{
   101  		Job: &pb.Job{
   102  			Id:     job.Id,
   103  			Type:   pb.Job_CVSDemo,
   104  			Config: []byte("{\"srcHost\":\"0.0.0.0:1234\", \"dstHost\":\"0.0.0.0:1234\", \"srcDir\":\"data\", \"dstDir\":\"data1\"}"),
   105  		},
   106  	}
   107  	_, err = mgr.CreateJob(ctx, req)
   108  	require.True(t, errors.Is(err, errors.ErrJobAlreadyExists))
   109  
   110  	// delete a finished job, re-create job with the same id will meet error
   111  	err = mockMaster.GetFrameMetaClient().UpdateJob(ctx, job.Id,
   112  		map[string]interface{}{
   113  			"state": frameModel.MasterStateFinished,
   114  		},
   115  	)
   116  	require.NoError(t, err)
   117  	_, err = mgr.DeleteJob(ctx, &pb.DeleteJobRequest{Id: job.Id})
   118  	require.NoError(t, err)
   119  	_, err = mgr.CreateJob(ctx, req)
   120  	require.True(t, errors.Is(err, errors.ErrJobAlreadyExists))
   121  }
   122  
   123  type mockBaseMasterCreateWorkerFailed struct {
   124  	*framework.MockMasterImpl
   125  }
   126  
   127  func (m *mockBaseMasterCreateWorkerFailed) CreateWorker(
   128  	workerType framework.WorkerType,
   129  	config framework.WorkerConfig,
   130  	opts ...framework.CreateWorkerOpt,
   131  ) (frameModel.WorkerID, error) {
   132  	return "", errors.ErrMasterConcurrencyExceeded.FastGenByArgs()
   133  }
   134  
   135  func TestCreateWorkerReturnError(t *testing.T) {
   136  	t.Parallel()
   137  
   138  	ctx, cancel := context.WithCancel(context.Background())
   139  	defer cancel()
   140  
   141  	masterImpl := framework.NewMockMasterImpl(t, "", "create-worker-with-error")
   142  	framework.MockMasterPrepareMeta(ctx, t, masterImpl)
   143  	mockMaster := &mockBaseMasterCreateWorkerFailed{
   144  		MockMasterImpl: masterImpl,
   145  	}
   146  	mgr := &JobManagerImpl{
   147  		BaseMaster:      mockMaster,
   148  		JobFsm:          NewJobFsm(),
   149  		uuidGen:         uuid.NewGenerator(),
   150  		frameMetaClient: mockMaster.GetFrameMetaClient(),
   151  	}
   152  	mockMaster.Impl = mgr
   153  	err := mockMaster.Init(ctx)
   154  	require.Nil(t, err)
   155  	req := &pb.CreateJobRequest{
   156  		Job: &pb.Job{
   157  			Type:   pb.Job_CVSDemo,
   158  			Config: []byte("{\"srcHost\":\"0.0.0.0:1234\", \"dstHost\":\"0.0.0.0:1234\", \"srcDir\":\"data\", \"dstDir\":\"data1\"}"),
   159  		},
   160  	}
   161  	_, err = mgr.CreateJob(ctx, req)
   162  	require.Error(t, err)
   163  	require.Contains(t, err.Error(), "ErrMasterConcurrencyExceeded")
   164  }
   165  
   166  func TestJobManagerCancelJob(t *testing.T) {
   167  	t.Parallel()
   168  
   169  	ctx, cancel := context.WithCancel(context.Background())
   170  	defer cancel()
   171  
   172  	masterID := "cancel-job-test"
   173  	mockMaster, mgr := prepareMockJobManager(ctx, t, masterID)
   174  	mockMaster.On("InitImpl", mock.Anything).Return(nil)
   175  	mgr.jobOperator = jobop.NewJobOperatorImpl(mgr.frameMetaClient, mgr)
   176  
   177  	cancelWorkerID := "cancel-worker-id"
   178  	meta := &frameModel.MasterMeta{
   179  		ID:    cancelWorkerID,
   180  		Type:  frameModel.CvsJobMaster,
   181  		State: frameModel.MasterStateInit,
   182  	}
   183  	mgr.JobFsm.JobDispatched(meta, false)
   184  
   185  	err := mgr.frameMetaClient.UpsertJob(ctx, meta)
   186  	require.NoError(t, err)
   187  	mockWorkerHandle := &framework.MockHandle{WorkerID: cancelWorkerID, ExecutorID: "executor-1"}
   188  	err = mgr.JobFsm.JobOnline(mockWorkerHandle)
   189  	require.NoError(t, err)
   190  
   191  	req := &pb.CancelJobRequest{
   192  		Id: cancelWorkerID,
   193  	}
   194  	job, err := mgr.CancelJob(ctx, req)
   195  	require.NoError(t, err)
   196  	require.Equal(t, pb.Job_Canceling, job.State)
   197  
   198  	for i := 0; i < 5; i++ {
   199  		err = mgr.jobOperator.Tick(ctx)
   200  		require.NoError(t, err)
   201  		require.Equal(t, i+1, mockWorkerHandle.SendMessageCount())
   202  	}
   203  
   204  	req.Id = cancelWorkerID + "-unknown"
   205  	_, err = mgr.CancelJob(ctx, req)
   206  	require.Error(t, err)
   207  	require.True(t, errors.Is(err, errors.ErrJobNotFound))
   208  }
   209  
   210  func TestJobManagerDeleteJob(t *testing.T) {
   211  	t.Parallel()
   212  
   213  	ctx, cancel := context.WithCancel(context.Background())
   214  	defer cancel()
   215  
   216  	masterID := "delete-job-test"
   217  	mockMaster, mgr := prepareMockJobManager(ctx, t, masterID)
   218  	mockMaster.On("InitImpl", mock.Anything).Return(nil)
   219  
   220  	err := mgr.frameMetaClient.UpsertJob(ctx, &frameModel.MasterMeta{
   221  		ID:    "job-to-be-deleted",
   222  		Type:  frameModel.FakeJobMaster,
   223  		State: frameModel.MasterStateStopped,
   224  	})
   225  	require.NoError(t, err)
   226  
   227  	err = mgr.OnMasterRecovered(ctx)
   228  	require.NoError(t, err)
   229  
   230  	_, err = mgr.DeleteJob(ctx, &pb.DeleteJobRequest{
   231  		Id: "job-to-be-deleted",
   232  	})
   233  	require.NoError(t, err)
   234  	_, err = mgr.frameMetaClient.GetJobByID(ctx, "job-to-be-deleted")
   235  	require.True(t, pkgOrm.IsNotFoundError(err))
   236  }
   237  
   238  func TestJobManagerGetJob(t *testing.T) {
   239  	t.Parallel()
   240  
   241  	ctx, cancel := context.WithCancel(context.Background())
   242  	defer cancel()
   243  
   244  	testCases := []struct {
   245  		meta             *frameModel.MasterMeta
   246  		expectedPBStatus pb.Job_State
   247  	}{
   248  		{
   249  			&frameModel.MasterMeta{
   250  				ID:    "master-1",
   251  				Type:  frameModel.FakeJobMaster,
   252  				State: frameModel.MasterStateUninit,
   253  			},
   254  			pb.Job_Created,
   255  		},
   256  		{
   257  			&frameModel.MasterMeta{
   258  				ID:    "master-2",
   259  				Type:  frameModel.FakeJobMaster,
   260  				State: frameModel.MasterStateInit,
   261  			},
   262  			pb.Job_Running,
   263  		},
   264  		{
   265  			&frameModel.MasterMeta{
   266  				ID:    "master-3",
   267  				Type:  frameModel.FakeJobMaster,
   268  				State: frameModel.MasterStateFinished,
   269  			},
   270  			pb.Job_Finished,
   271  		},
   272  		{
   273  			&frameModel.MasterMeta{
   274  				ID:    "master-4",
   275  				Type:  frameModel.FakeJobMaster,
   276  				State: frameModel.MasterStateStopped,
   277  			},
   278  			pb.Job_Canceled,
   279  		},
   280  	}
   281  
   282  	mockMaster := framework.NewMockMasterImpl(t, "", "job-manager-get-job-test")
   283  	framework.MockMasterPrepareMeta(ctx, t, mockMaster)
   284  	for _, tc := range testCases {
   285  		cli := metadata.NewMasterMetadataClient(tc.meta.ID, mockMaster.GetFrameMetaClient())
   286  		err := cli.Store(ctx, tc.meta)
   287  		require.Nil(t, err)
   288  	}
   289  
   290  	mgr := &JobManagerImpl{
   291  		BaseMaster:       mockMaster.DefaultBaseMaster,
   292  		JobFsm:           NewJobFsm(),
   293  		uuidGen:          uuid.NewGenerator(),
   294  		masterMetaClient: metadata.NewMasterMetadataClient(metadata.JobManagerUUID, mockMaster.GetFrameMetaClient()),
   295  		frameMetaClient:  mockMaster.GetFrameMetaClient(),
   296  		jobHTTPClient:    jobMock.NewMockNilReturnJobHTTPClient(),
   297  	}
   298  
   299  	statuses, err := mgr.GetJobStatuses(ctx)
   300  	require.NoError(t, err)
   301  	require.Len(t, statuses, len(testCases)+1)
   302  
   303  	for _, tc := range testCases {
   304  		req := &pb.GetJobRequest{
   305  			Id: tc.meta.ID,
   306  		}
   307  		job, err := mgr.GetJob(ctx, req)
   308  		require.NoError(t, err)
   309  		require.Equal(t, tc.expectedPBStatus, job.GetState())
   310  
   311  		require.Contains(t, statuses, tc.meta.ID)
   312  		require.Equal(t, tc.meta.State, statuses[tc.meta.ID])
   313  	}
   314  }
   315  
   316  func TestJobManagerOnlineJob(t *testing.T) {
   317  	t.Parallel()
   318  
   319  	ctx, cancel := context.WithCancel(context.Background())
   320  	defer cancel()
   321  
   322  	mockMaster := framework.NewMockMasterImpl(t, "", "submit-job-test")
   323  	framework.MockMasterPrepareMeta(ctx, t, mockMaster)
   324  	mockMaster.On("InitImpl", mock.Anything).Return(nil)
   325  	mockMaster.MasterClient().EXPECT().ScheduleTask(gomock.Any(), gomock.Any()).
   326  		Return(&pb.ScheduleTaskResponse{}, errors.ErrClusterResourceNotEnough.FastGenByArgs()).MinTimes(0)
   327  	mgr := &JobManagerImpl{
   328  		BaseMaster:        mockMaster.DefaultBaseMaster,
   329  		JobFsm:            NewJobFsm(),
   330  		uuidGen:           uuid.NewGenerator(),
   331  		frameMetaClient:   mockMaster.GetFrameMetaClient(),
   332  		jobStatusChangeMu: ctxmu.New(),
   333  	}
   334  	// set master impl to JobManagerImpl
   335  	mockMaster.Impl = mgr
   336  	err := mockMaster.Init(ctx)
   337  	require.Nil(t, err)
   338  	req := &pb.CreateJobRequest{
   339  		Job: &pb.Job{
   340  			Type:   pb.Job_CVSDemo,
   341  			Config: []byte("{\"srcHost\":\"0.0.0.0:1234\", \"dstHost\":\"0.0.0.0:1234\", \"srcDir\":\"data\", \"dstDir\":\"data1\"}"),
   342  		},
   343  	}
   344  	job, err := mgr.CreateJob(ctx, req)
   345  	require.NoError(t, err)
   346  
   347  	err = mgr.JobFsm.JobOnline(&framework.MockHandle{
   348  		WorkerID:   job.Id,
   349  		ExecutorID: "executor-1",
   350  	})
   351  	require.NoError(t, err)
   352  	require.Len(t, mgr.JobFsm.waitAckJobs, 0)
   353  	require.Len(t, mgr.JobFsm.onlineJobs, 1)
   354  }
   355  
   356  func TestJobManagerRecover(t *testing.T) {
   357  	t.Parallel()
   358  
   359  	ctx, cancel := context.WithCancel(context.Background())
   360  	defer cancel()
   361  
   362  	mockMaster := framework.NewMockMasterImpl(t, "", "job-manager-recover-test")
   363  	framework.MockMasterPrepareMeta(ctx, t, mockMaster)
   364  	// prepare mockvk with two job masters
   365  	meta := []*frameModel.MasterMeta{
   366  		{
   367  			ID:   "master-1",
   368  			Type: frameModel.FakeJobMaster,
   369  		},
   370  		{
   371  			ID:   "master-2",
   372  			Type: frameModel.FakeJobMaster,
   373  		},
   374  	}
   375  	for _, data := range meta {
   376  		cli := metadata.NewMasterMetadataClient(data.ID, mockMaster.GetFrameMetaClient())
   377  		err := cli.Store(ctx, data)
   378  		require.Nil(t, err)
   379  	}
   380  
   381  	mgr := &JobManagerImpl{
   382  		BaseMaster:       mockMaster.DefaultBaseMaster,
   383  		JobFsm:           NewJobFsm(),
   384  		uuidGen:          uuid.NewGenerator(),
   385  		masterMetaClient: metadata.NewMasterMetadataClient(metadata.JobManagerUUID, mockMaster.GetFrameMetaClient()),
   386  		frameMetaClient:  mockMaster.GetFrameMetaClient(),
   387  		jobHTTPClient:    jobMock.NewMockNilReturnJobHTTPClient(),
   388  	}
   389  	err := mgr.OnMasterRecovered(ctx)
   390  	require.NoError(t, err)
   391  	require.Len(t, mgr.JobFsm.waitAckJobs, 3)
   392  }
   393  
   394  func TestJobManagerTickExceedQuota(t *testing.T) {
   395  	t.Parallel()
   396  
   397  	ctx, cancel := context.WithCancel(context.Background())
   398  	defer cancel()
   399  
   400  	masterImpl := framework.NewMockMasterImpl(t, "", "create-worker-with-error")
   401  	framework.MockMasterPrepareMeta(ctx, t, masterImpl)
   402  	mockMaster := &mockBaseMasterCreateWorkerFailed{
   403  		MockMasterImpl: masterImpl,
   404  	}
   405  	mgr := &JobManagerImpl{
   406  		BaseMaster:      mockMaster,
   407  		JobFsm:          NewJobFsm(),
   408  		uuidGen:         uuid.NewGenerator(),
   409  		frameMetaClient: mockMaster.GetFrameMetaClient(),
   410  		jobHTTPClient:   jobMock.NewMockNilReturnJobHTTPClient(),
   411  	}
   412  	mockMaster.Impl = mgr
   413  	err := mockMaster.Init(ctx)
   414  	require.NoError(t, err)
   415  
   416  	mgr.JobFsm.JobDispatched(&frameModel.MasterMeta{ID: "failover-job-master"}, true)
   417  	// try to recreate failover job master, will meet quota error
   418  	err = mgr.Tick(ctx)
   419  	require.NoError(t, err)
   420  	require.Len(t, mgr.JobFsm.waitAckJobs, 1)
   421  
   422  	// try to recreate failover job master again, will meet quota error again
   423  	err = mgr.Tick(ctx)
   424  	require.NoError(t, err)
   425  	require.Len(t, mgr.JobFsm.waitAckJobs, 1)
   426  }
   427  
   428  func TestJobManagerWatchJobStatuses(t *testing.T) {
   429  	t.Parallel()
   430  
   431  	ctx, cancel := context.WithCancel(context.Background())
   432  	defer cancel()
   433  
   434  	masterID := "delete-job-test"
   435  	mockMaster, mgr := prepareMockJobManager(ctx, t, masterID)
   436  	mockMaster.On("InitImpl", mock.Anything).Return(nil)
   437  
   438  	err := mgr.frameMetaClient.UpsertJob(ctx, &frameModel.MasterMeta{
   439  		ID:    "job-to-be-deleted",
   440  		Type:  frameModel.FakeJobMaster,
   441  		State: frameModel.MasterStateStopped,
   442  	})
   443  	require.NoError(t, err)
   444  
   445  	err = mgr.OnMasterRecovered(ctx)
   446  	require.NoError(t, err)
   447  
   448  	snap, stream, err := mgr.WatchJobStatuses(ctx)
   449  	require.NoError(t, err)
   450  	require.Equal(t, map[frameModel.MasterID]frameModel.MasterState{
   451  		"delete-job-test":   frameModel.MasterStateUninit,
   452  		"job-to-be-deleted": frameModel.MasterStateStopped,
   453  	}, snap)
   454  
   455  	_, err = mgr.DeleteJob(ctx, &pb.DeleteJobRequest{
   456  		Id: "job-to-be-deleted",
   457  	})
   458  	require.NoError(t, err)
   459  
   460  	event := <-stream.C
   461  	require.Equal(t, resManager.JobStatusChangeEvent{
   462  		EventType: resManager.JobRemovedEvent,
   463  		JobID:     "job-to-be-deleted",
   464  	}, event)
   465  }
   466  
   467  func TestGetJobDetailFromJobMaster(t *testing.T) {
   468  	t.Parallel()
   469  
   470  	ctx := context.TODO()
   471  	masterID := "get-job-detail"
   472  	mockMaster, mgr := prepareMockJobManager(ctx, t, masterID)
   473  	mockMaster.On("InitImpl", mock.Anything).Return(nil)
   474  
   475  	mockCtrl := gomock.NewController(t)
   476  	defer mockCtrl.Finish()
   477  	mockJobClient := jobMock.NewMockJobHTTPClient(mockCtrl)
   478  	mgr.jobHTTPClient = mockJobClient
   479  
   480  	masterMeta := &frameModel.MasterMeta{
   481  		ID:   "new-job",
   482  		Type: frameModel.FakeJobMaster,
   483  		// set state to running
   484  		State:    frameModel.MasterStateInit,
   485  		Addr:     "127.0.0.1:10340",
   486  		ErrorMsg: "error_message",
   487  	}
   488  
   489  	// normal case, return job detail
   490  	err := mgr.frameMetaClient.UpsertJob(ctx, masterMeta)
   491  	require.NoError(t, err)
   492  
   493  	mgr.JobFsm.JobDispatched(masterMeta, false)
   494  	err = mgr.JobFsm.JobOnline(&framework.MockHandle{
   495  		WorkerID:   "new-job",
   496  		ExecutorID: "executor-1",
   497  	})
   498  	require.NoError(t, err)
   499  
   500  	mockJobClient.EXPECT().GetJobDetail(ctx, "127.0.0.1:10340", "new-job").Return([]byte("detail test"), nil).Times(1)
   501  	job, err := mgr.GetJob(ctx, &pb.GetJobRequest{Id: "new-job"})
   502  	require.NoError(t, err)
   503  	require.True(t, proto.Equal(&pb.Job{
   504  		Id:     "new-job",
   505  		Type:   pb.Job_FakeJob,
   506  		State:  pb.Job_Running,
   507  		Detail: []byte("detail test"),
   508  		Error: &pb.Job_Error{
   509  			Message: "error_message",
   510  		},
   511  	}, job))
   512  
   513  	// get job detail failed
   514  	err = mgr.frameMetaClient.UpsertJob(ctx, &frameModel.MasterMeta{
   515  		ID:   "new-job",
   516  		Type: frameModel.FakeJobMaster,
   517  		// set status code to running state
   518  		State:    frameModel.MasterStateInit,
   519  		Addr:     "127.0.0.1:10340",
   520  		ErrorMsg: "error_message",
   521  	})
   522  	require.NoError(t, err)
   523  
   524  	mockJobClient.EXPECT().
   525  		GetJobDetail(ctx, "127.0.0.1:10340", "new-job").
   526  		Return(nil, &openapi.HTTPError{
   527  			Code:    string(errors.ErrJobNotRunning.RFCCode()),
   528  			Message: "job new-job is not running",
   529  		}).
   530  		Times(1)
   531  	job, err = mgr.GetJob(ctx, &pb.GetJobRequest{Id: "new-job"})
   532  	require.NoError(t, err)
   533  	require.True(t, proto.Equal(&pb.Job{
   534  		Id:    "new-job",
   535  		Type:  pb.Job_FakeJob,
   536  		State: pb.Job_Running,
   537  		Error: &pb.Job_Error{
   538  			Code:    "DFLOW:ErrJobNotRunning",
   539  			Message: "job new-job is not running",
   540  		},
   541  	}, job))
   542  }
   543  
   544  func TestListJobsPagination(t *testing.T) {
   545  	t.Parallel()
   546  
   547  	ctx, cancel := context.WithCancel(context.Background())
   548  	defer cancel()
   549  
   550  	mockMaster := framework.NewMockMasterImpl(t, "", "job-manager-list-jobs-test")
   551  	masterMeta := mockMaster.DefaultBaseMaster.MasterMeta()
   552  	masterMeta.Type = frameModel.JobManager
   553  	err := mockMaster.GetFrameMetaClient().UpsertJob(ctx, masterMeta)
   554  	require.NoError(t, err)
   555  
   556  	const totalJobCount = 2000
   557  
   558  	jobIDs := make([]string, 0, totalJobCount)
   559  	for i := 0; i < totalJobCount; i++ {
   560  		jobID := fmt.Sprintf("job-%04d", i)
   561  		jobIDs = append(jobIDs, jobID)
   562  		cli := metadata.NewMasterMetadataClient(jobID, mockMaster.GetFrameMetaClient())
   563  		require.NoError(t, cli.Store(ctx, &frameModel.MasterMeta{
   564  			ID:    jobID,
   565  			Type:  frameModel.FakeJobMaster,
   566  			State: frameModel.MasterStateStopped,
   567  		}))
   568  	}
   569  
   570  	mgr := &JobManagerImpl{
   571  		BaseMaster:       mockMaster.DefaultBaseMaster,
   572  		JobFsm:           NewJobFsm(),
   573  		uuidGen:          uuid.NewGenerator(),
   574  		masterMetaClient: metadata.NewMasterMetadataClient(metadata.JobManagerUUID, mockMaster.GetFrameMetaClient()),
   575  		frameMetaClient:  mockMaster.GetFrameMetaClient(),
   576  		jobHTTPClient:    jobMock.NewMockNilReturnJobHTTPClient(),
   577  	}
   578  
   579  	// List jobs without specifying page size.
   580  	resp, err := mgr.ListJobs(ctx, &pb.ListJobsRequest{})
   581  	require.NoError(t, err)
   582  	require.Len(t, resp.Jobs, defaultListPageSize)
   583  	for i := 0; i < defaultListPageSize; i++ {
   584  		require.Equal(t, jobIDs[i], resp.Jobs[i].Id)
   585  	}
   586  	require.Equal(t, jobIDs[defaultListPageSize-1], resp.NextPageToken)
   587  
   588  	// List jobs with huge page size.
   589  	resp, err = mgr.ListJobs(ctx, &pb.ListJobsRequest{PageSize: 10000})
   590  	require.NoError(t, err)
   591  	require.Len(t, resp.Jobs, maxListPageSize)
   592  
   593  	// List all jobs with pagination.
   594  	var (
   595  		respJobIDs    []string
   596  		nextPageToken string
   597  	)
   598  	pageSize := 123
   599  	for {
   600  		resp, err = mgr.ListJobs(ctx, &pb.ListJobsRequest{PageSize: int32(pageSize), PageToken: nextPageToken})
   601  		require.NoError(t, err)
   602  		for _, job := range resp.Jobs {
   603  			respJobIDs = append(respJobIDs, job.Id)
   604  		}
   605  		if resp.NextPageToken == "" {
   606  			break
   607  		}
   608  		nextPageToken = resp.NextPageToken
   609  	}
   610  	require.Equal(t, jobIDs, respJobIDs)
   611  }
   612  
   613  func TestListJobWithFilter(t *testing.T) {
   614  	t.Parallel()
   615  
   616  	ctx, cancel := context.WithCancel(context.Background())
   617  	defer cancel()
   618  
   619  	mockMaster := framework.NewMockMasterImpl(t, "", "job-manager-list-jobs-test")
   620  	masterMeta := mockMaster.DefaultBaseMaster.MasterMeta()
   621  	masterMeta.Type = frameModel.JobManager
   622  	err := mockMaster.GetFrameMetaClient().UpsertJob(ctx, masterMeta)
   623  	require.NoError(t, err)
   624  
   625  	allTypes := []frameModel.WorkerType{
   626  		frameModel.CvsJobMaster, frameModel.FakeJobMaster,
   627  		frameModel.DMJobMaster, frameModel.CdcJobMaster,
   628  	}
   629  	allStates := []frameModel.MasterState{
   630  		frameModel.MasterStateUninit, frameModel.MasterStateInit,
   631  		frameModel.MasterStateFinished, frameModel.MasterStateStopped, frameModel.MasterStateFailed,
   632  	}
   633  	rnd := rand.New(rand.NewSource(0))
   634  	randType := func() frameModel.WorkerType {
   635  		return allTypes[rnd.Intn(len(allTypes))]
   636  	}
   637  	randState := func() frameModel.MasterState {
   638  		return allStates[rnd.Intn(len(allStates))]
   639  	}
   640  
   641  	const totalJobCount = maxListPageSize
   642  	countByType := make(map[frameModel.WorkerType]int)
   643  	countByState := make(map[frameModel.MasterState]int)
   644  	for i := 0; i < totalJobCount; i++ {
   645  		jobID := fmt.Sprintf("job-%04d", i)
   646  		cli := metadata.NewMasterMetadataClient("job-1", mockMaster.GetFrameMetaClient())
   647  		masterMeta := &frameModel.MasterMeta{
   648  			ID:    jobID,
   649  			Type:  randType(),
   650  			State: randState(),
   651  		}
   652  		require.NoError(t, cli.Store(ctx, masterMeta))
   653  		countByType[masterMeta.Type]++
   654  		countByState[masterMeta.State]++
   655  	}
   656  
   657  	mgr := &JobManagerImpl{
   658  		BaseMaster:       mockMaster.DefaultBaseMaster,
   659  		JobFsm:           NewJobFsm(),
   660  		uuidGen:          uuid.NewGenerator(),
   661  		masterMetaClient: metadata.NewMasterMetadataClient(metadata.JobManagerUUID, mockMaster.GetFrameMetaClient()),
   662  		frameMetaClient:  mockMaster.GetFrameMetaClient(),
   663  		jobHTTPClient:    jobMock.NewMockNilReturnJobHTTPClient(),
   664  	}
   665  
   666  	// List jobs with filter.
   667  	// TODO: we should test all combinations of filters, but there's no convenient way
   668  	//  to mapping worker type to job type and master state to job state.
   669  	resp, err := mgr.ListJobs(ctx, &pb.ListJobsRequest{
   670  		PageSize: totalJobCount,
   671  		Type:     pb.Job_FakeJob,
   672  	})
   673  	require.NoError(t, err)
   674  	require.Len(t, resp.Jobs, countByType[frameModel.FakeJobMaster])
   675  
   676  	resp, err = mgr.ListJobs(ctx, &pb.ListJobsRequest{
   677  		PageSize: totalJobCount,
   678  		State:    pb.Job_Running,
   679  	})
   680  	require.NoError(t, err)
   681  	require.Len(t, resp.Jobs, countByState[frameModel.MasterStateInit])
   682  }
   683  
   684  func TestOnWorkerDispatchedFastFail(t *testing.T) {
   685  	t.Parallel()
   686  
   687  	ctx, cancel := context.WithCancel(context.Background())
   688  	defer cancel()
   689  
   690  	masterID := "job-fast-fail-test"
   691  	mockMaster, mgr := prepareMockJobManager(ctx, t, masterID)
   692  	mockMaster.On("InitImpl", mock.Anything).Return(nil)
   693  
   694  	// simulate a job is created.
   695  	mgr.JobFsm.JobDispatched(mockMaster.MasterMeta(), false)
   696  	errorMsg := "unit test fast fail error"
   697  	mockHandle := &framework.MockHandle{WorkerID: masterID}
   698  	nerr := errors.ErrCreateWorkerTerminate.GenWithStack(errorMsg)
   699  	// OnWorkerDispatched callback on job manager, a terminated error will make
   700  	// job fast fail.
   701  	err := mgr.OnWorkerDispatched(mockHandle, nerr)
   702  	require.NoError(t, err)
   703  	meta, err := mgr.frameMetaClient.QueryJobsByState(ctx,
   704  		mockMaster.MasterMeta().ProjectID, int(frameModel.MasterStateFailed))
   705  	require.NoError(t, err)
   706  	require.Len(t, meta, 1)
   707  	require.Equal(t, nerr.Error(), meta[0].ErrorMsg)
   708  }
   709  
   710  func TestJobOperatorBgLoop(t *testing.T) {
   711  	t.Parallel()
   712  
   713  	ctx, cancel := context.WithCancel(context.Background())
   714  	defer cancel()
   715  
   716  	masterID := "job-operator-bg-loop-test"
   717  	mockMaster, mgr := prepareMockJobManager(ctx, t, masterID)
   718  	mockMaster.On("InitImpl", mock.Anything).Return(nil)
   719  
   720  	mockJobOperator := jobopMock.NewMockJobOperator(gomock.NewController(t))
   721  	mgr.jobOperator = mockJobOperator
   722  
   723  	wg, ctx := errgroup.WithContext(ctx)
   724  	mgr.wg = wg
   725  	mgr.bgJobOperatorLoop(ctx)
   726  
   727  	tickCounter := atomic.NewInt32(0)
   728  	mockJobOperator.EXPECT().
   729  		Tick(gomock.Any()).AnyTimes().
   730  		DoAndReturn(func(ctx context.Context) error {
   731  			tickCounter.Add(1)
   732  			return nil
   733  		})
   734  	wg.Go(func() error {
   735  		for i := 0; i < 6; i++ {
   736  			mgr.jobOperatorNotifier.Notify()
   737  			time.Sleep(time.Millisecond * 50)
   738  		}
   739  		return nil
   740  	})
   741  	require.Eventually(t, func() bool {
   742  		return tickCounter.Load() > 0
   743  	}, time.Second, time.Millisecond*100)
   744  
   745  	mgr.CloseImpl(ctx)
   746  	require.NoError(t, mgr.wg.Wait())
   747  }
   748  
   749  // TODO: refine the interface of JobManager and use mock JobManager in test
   750  func dispatchJobAndMeetError(
   751  	ctx context.Context, t *testing.T, mgr *JobManagerImpl, meta *frameModel.MasterMeta,
   752  ) {
   753  	err := mgr.frameMetaClient.UpsertJob(ctx, meta)
   754  	require.NoError(t, err)
   755  
   756  	// dispatch job, meet error and move it to pending job list
   757  	mgr.JobFsm.JobDispatched(&frameModel.MasterMeta{ID: meta.ID}, false)
   758  	require.NotNil(t, mgr.QueryJob(meta.ID))
   759  	mockHandle := &framework.MockHandle{WorkerID: meta.ID}
   760  	mgr.JobFsm.JobOffline(mockHandle, true /* needFailover */)
   761  }
   762  
   763  func TestJobManagerIterPendingJobs(t *testing.T) {
   764  	t.Parallel()
   765  
   766  	ctx, cancel := context.WithCancel(context.Background())
   767  	defer cancel()
   768  
   769  	masterImpl := framework.NewMockMasterImpl(t, "", "iter-pending-jobs-test")
   770  	framework.MockMasterPrepareMeta(ctx, t, masterImpl)
   771  	mockMaster := &mockBaseMasterCreateWorkerFailed{
   772  		MockMasterImpl: masterImpl,
   773  	}
   774  	ctrl := gomock.NewController(t)
   775  	mockBackoffMgr := jobopMock.NewMockBackoffManager(ctrl)
   776  	mockJobOperator := jobopMock.NewMockJobOperator(ctrl)
   777  	mgr := &JobManagerImpl{
   778  		BaseMaster:      mockMaster,
   779  		JobFsm:          NewJobFsm(),
   780  		uuidGen:         uuid.NewGenerator(),
   781  		frameMetaClient: mockMaster.GetFrameMetaClient(),
   782  		jobHTTPClient:   jobMock.NewMockNilReturnJobHTTPClient(),
   783  		JobBackoffMgr:   mockBackoffMgr,
   784  		jobOperator:     mockJobOperator,
   785  	}
   786  	mockMaster.Impl = mgr
   787  	err := mockMaster.Init(ctx)
   788  	require.NoError(t, err)
   789  
   790  	newMasterMeta := func(jobID string) *frameModel.MasterMeta {
   791  		return &frameModel.MasterMeta{
   792  			ID:    jobID,
   793  			State: frameModel.MasterStateInit,
   794  		}
   795  	}
   796  
   797  	jobMgrTickAndCheckJobState := func(jobID string, state frameModel.MasterState) {
   798  		err := mgr.Tick(ctx)
   799  		require.NoError(t, err)
   800  		meta, err := mgr.frameMetaClient.GetJobByID(ctx, jobID)
   801  		require.NoError(t, err)
   802  		require.Equal(t, state, meta.State)
   803  	}
   804  
   805  	{
   806  		jobID := "job-backoff-test-1"
   807  		dispatchJobAndMeetError(ctx, t, mgr, newMasterMeta(jobID))
   808  
   809  		// job is being backoff
   810  		mockJobOperator.EXPECT().IsJobCanceling(ctx, jobID).Times(1).Return(false)
   811  		mockBackoffMgr.EXPECT().Terminate(jobID).Times(1).Return(false)
   812  		mockBackoffMgr.EXPECT().Allow(jobID).Times(1).Return(false)
   813  		err = mgr.Tick(ctx)
   814  		require.NoError(t, err)
   815  
   816  		// job will be terminated because it exceeds max try time
   817  		mockJobOperator.EXPECT().IsJobCanceling(ctx, jobID).Times(1).Return(false)
   818  		mockBackoffMgr.EXPECT().Terminate(jobID).Times(1).Return(true)
   819  		jobMgrTickAndCheckJobState(jobID, frameModel.MasterStateFailed)
   820  	}
   821  
   822  	{
   823  		jobID := "job-backoff-test-2"
   824  		dispatchJobAndMeetError(ctx, t, mgr, newMasterMeta(jobID))
   825  
   826  		// job will be terminated because it is canceled
   827  		mockJobOperator.EXPECT().IsJobCanceling(ctx, jobID).Times(1).Return(true)
   828  		jobMgrTickAndCheckJobState(jobID, frameModel.MasterStateStopped)
   829  	}
   830  }
   831  
   832  func TestFailoverWithCreateWorkerOpt(t *testing.T) {
   833  	t.Parallel()
   834  
   835  	ctx, cancel := context.WithCancel(context.Background())
   836  	defer cancel()
   837  
   838  	selectors := []*label.Selector{
   839  		{Key: "name", Target: "executor.*", Op: label.OpRegex},
   840  		{Key: "region", Target: "us-west-2", Op: label.OpEq},
   841  	}
   842  	checkOptsFn := func(opts ...framework.CreateWorkerOpt) {
   843  		// CreateWorkerOpt: 1 for label selectors
   844  		require.Len(t, opts, 1)
   845  	}
   846  
   847  	masterImpl := framework.NewMockMasterImpl(t, "", "iter-pending-jobs-test")
   848  	framework.MockMasterPrepareMeta(ctx, t, masterImpl)
   849  	mockMaster := &mockBaseMasterCheckCreateOpts{
   850  		MockMasterImpl: masterImpl,
   851  		checkOptsFn:    checkOptsFn,
   852  	}
   853  	ctrl := gomock.NewController(t)
   854  	mockBackoffMgr := jobopMock.NewMockBackoffManager(ctrl)
   855  	mockJobOperator := jobopMock.NewMockJobOperator(ctrl)
   856  	mgr := &JobManagerImpl{
   857  		BaseMaster:      mockMaster,
   858  		JobFsm:          NewJobFsm(),
   859  		uuidGen:         uuid.NewGenerator(),
   860  		frameMetaClient: mockMaster.GetFrameMetaClient(),
   861  		jobHTTPClient:   jobMock.NewMockNilReturnJobHTTPClient(),
   862  		JobBackoffMgr:   mockBackoffMgr,
   863  		jobOperator:     mockJobOperator,
   864  	}
   865  	mockMaster.Impl = mgr
   866  	err := mockMaster.Init(ctx)
   867  	require.NoError(t, err)
   868  
   869  	{
   870  		job := &frameModel.MasterMeta{
   871  			ID:    "failover-job-with-label",
   872  			State: frameModel.MasterStateInit,
   873  			Ext:   frameModel.MasterMetaExt{Selectors: selectors},
   874  		}
   875  		dispatchJobAndMeetError(ctx, t, mgr, job)
   876  
   877  		mockJobOperator.EXPECT().IsJobCanceling(ctx, job.ID).Times(1).Return(false)
   878  		mockBackoffMgr.EXPECT().Terminate(job.ID).Times(1).Return(false)
   879  		mockBackoffMgr.EXPECT().Allow(job.ID).Times(1).Return(true)
   880  		err := mgr.Tick(ctx)
   881  		require.NoError(t, err)
   882  	}
   883  }
   884  
   885  type mockBaseMasterCheckCreateOpts struct {
   886  	*framework.MockMasterImpl
   887  	checkOptsFn func(opts ...framework.CreateWorkerOpt)
   888  }
   889  
   890  func (m *mockBaseMasterCheckCreateOpts) CreateWorker(
   891  	workerType framework.WorkerType,
   892  	config framework.WorkerConfig,
   893  	opts ...framework.CreateWorkerOpt,
   894  ) (frameModel.WorkerID, error) {
   895  	m.checkOptsFn(opts...)
   896  	return uuid.NewGenerator().NewString(), nil
   897  }
   898  
   899  func TestIsJobTerminated(t *testing.T) {
   900  	require.False(t, isJobTerminated(frameModel.MasterStateUninit))
   901  	require.False(t, isJobTerminated(frameModel.MasterStateInit))
   902  	require.True(t, isJobTerminated(frameModel.MasterStateFinished))
   903  	require.True(t, isJobTerminated(frameModel.MasterStateFailed))
   904  	require.True(t, isJobTerminated(frameModel.MasterStateStopped))
   905  }
   906  
   907  func TestBuildPBJob(t *testing.T) {
   908  	t.Parallel()
   909  
   910  	testCases := []struct {
   911  		masterMeta    *frameModel.MasterMeta
   912  		includeConfig bool
   913  		job           *pb.Job
   914  	}{
   915  		{
   916  			masterMeta: &frameModel.MasterMeta{
   917  				ID:     "job-1",
   918  				Type:   frameModel.CvsJobMaster,
   919  				State:  frameModel.MasterStateUninit,
   920  				Config: []byte("job-1-config"),
   921  				Detail: []byte("job-1-detail"),
   922  			},
   923  			includeConfig: true,
   924  			job: &pb.Job{
   925  				Id:     "job-1",
   926  				Type:   pb.Job_CVSDemo,
   927  				State:  pb.Job_Created,
   928  				Error:  &pb.Job_Error{},
   929  				Config: []byte("job-1-config"),
   930  				Detail: []byte("job-1-detail"),
   931  			},
   932  		},
   933  		{
   934  			masterMeta: &frameModel.MasterMeta{
   935  				ID:     "job-2",
   936  				Type:   frameModel.DMJobMaster,
   937  				State:  frameModel.MasterStateInit,
   938  				Config: []byte("job-2-config"),
   939  				Detail: []byte("job-2-detail"),
   940  			},
   941  			includeConfig: true,
   942  			job: &pb.Job{
   943  				Id:     "job-2",
   944  				Type:   pb.Job_DM,
   945  				State:  pb.Job_Running,
   946  				Error:  &pb.Job_Error{},
   947  				Config: []byte("job-2-config"),
   948  				Detail: []byte("job-2-detail"),
   949  			},
   950  		},
   951  		{
   952  			masterMeta: &frameModel.MasterMeta{
   953  				ID:     "job-3",
   954  				Type:   frameModel.CdcJobMaster,
   955  				State:  frameModel.MasterStateStopped,
   956  				Config: []byte("job-3-config"),
   957  				Detail: []byte("job-3-detail"),
   958  			},
   959  			includeConfig: true,
   960  			job: &pb.Job{
   961  				Id:     "job-3",
   962  				Type:   pb.Job_CDC,
   963  				State:  pb.Job_Canceled,
   964  				Error:  &pb.Job_Error{},
   965  				Config: []byte("job-3-config"),
   966  				Detail: []byte("job-3-detail"),
   967  			},
   968  		},
   969  		{
   970  			masterMeta: &frameModel.MasterMeta{
   971  				ID:     "job-4",
   972  				Type:   frameModel.FakeJobMaster,
   973  				State:  frameModel.MasterStateFinished,
   974  				Config: []byte("job-4-config"),
   975  				Detail: []byte("job-4-detail"),
   976  			},
   977  			job: &pb.Job{
   978  				Id:     "job-4",
   979  				Type:   pb.Job_FakeJob,
   980  				State:  pb.Job_Finished,
   981  				Error:  &pb.Job_Error{},
   982  				Detail: []byte("job-4-detail"),
   983  			},
   984  		},
   985  		{
   986  			masterMeta: &frameModel.MasterMeta{
   987  				ID:       "job-5",
   988  				Type:     frameModel.FakeJobMaster,
   989  				State:    frameModel.MasterStateFailed,
   990  				Config:   []byte("job-5-config"),
   991  				Detail:   []byte("job-5-detail"),
   992  				ErrorMsg: "job-5-error",
   993  			},
   994  			job: &pb.Job{
   995  				Id:    "job-5",
   996  				Type:  pb.Job_FakeJob,
   997  				State: pb.Job_Failed,
   998  				Error: &pb.Job_Error{
   999  					Message: "job-5-error",
  1000  				},
  1001  				Detail: []byte("job-5-detail"),
  1002  			},
  1003  		},
  1004  	}
  1005  
  1006  	for _, tc := range testCases {
  1007  		job, err := buildPBJob(tc.masterMeta, tc.includeConfig)
  1008  		require.NoError(t, err)
  1009  		require.True(t, proto.Equal(tc.job, job))
  1010  	}
  1011  }