github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/engine/servermaster/jobmanager_test.go (about) 1 // Copyright 2022 PingCAP, Inc. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // See the License for the specific language governing permissions and 12 // limitations under the License. 13 14 package servermaster 15 16 import ( 17 "context" 18 "fmt" 19 "math/rand" 20 "testing" 21 "time" 22 23 "github.com/golang/mock/gomock" 24 pb "github.com/pingcap/tiflow/engine/enginepb" 25 "github.com/pingcap/tiflow/engine/framework" 26 "github.com/pingcap/tiflow/engine/framework/metadata" 27 frameModel "github.com/pingcap/tiflow/engine/framework/model" 28 "github.com/pingcap/tiflow/engine/pkg/clock" 29 "github.com/pingcap/tiflow/engine/pkg/ctxmu" 30 resManager "github.com/pingcap/tiflow/engine/pkg/externalresource/manager" 31 jobMock "github.com/pingcap/tiflow/engine/pkg/httputil/mock" 32 "github.com/pingcap/tiflow/engine/pkg/notifier" 33 "github.com/pingcap/tiflow/engine/pkg/openapi" 34 pkgOrm "github.com/pingcap/tiflow/engine/pkg/orm" 35 "github.com/pingcap/tiflow/engine/servermaster/jobop" 36 jobopMock "github.com/pingcap/tiflow/engine/servermaster/jobop/mock" 37 "github.com/pingcap/tiflow/pkg/errors" 38 "github.com/pingcap/tiflow/pkg/label" 39 "github.com/pingcap/tiflow/pkg/notify" 40 "github.com/pingcap/tiflow/pkg/uuid" 41 "github.com/stretchr/testify/mock" 42 "github.com/stretchr/testify/require" 43 "go.uber.org/atomic" 44 "golang.org/x/sync/errgroup" 45 "google.golang.org/protobuf/proto" 46 ) 47 48 func prepareMockJobManager( 49 ctx context.Context, t *testing.T, masterID string, 50 ) (*framework.MockMasterImpl, *JobManagerImpl) { 51 mockMaster := framework.NewMockMasterImpl(t, "", masterID) 52 framework.MockMasterPrepareMeta(ctx, t, mockMaster) 53 mgr := &JobManagerImpl{ 54 BaseMaster: mockMaster.DefaultBaseMaster, 55 JobFsm: NewJobFsm(), 56 clocker: clock.New(), 57 uuidGen: uuid.NewGenerator(), 58 frameMetaClient: mockMaster.GetFrameMetaClient(), 59 masterMetaClient: metadata.NewMasterMetadataClient(metadata.JobManagerUUID, mockMaster.GetFrameMetaClient()), 60 jobStatusChangeMu: ctxmu.New(), 61 notifier: notifier.NewNotifier[resManager.JobStatusChangeEvent](), 62 jobOperatorNotifier: new(notify.Notifier), 63 jobHTTPClient: jobMock.NewMockNilReturnJobHTTPClient(), 64 } 65 return mockMaster, mgr 66 } 67 68 func TestJobManagerCreateJob(t *testing.T) { 69 t.Parallel() 70 71 ctx, cancel := context.WithCancel(context.Background()) 72 defer cancel() 73 74 masterID := "create-job-test" 75 mockMaster, mgr := prepareMockJobManager(ctx, t, masterID) 76 mockMaster.On("InitImpl", mock.Anything).Return(nil) 77 mockMaster.MasterClient().EXPECT().ScheduleTask( 78 gomock.Any(), 79 gomock.Any()).Return(&pb.ScheduleTaskResponse{}, errors.ErrClusterResourceNotEnough.FastGenByArgs()).Times(1) 80 wg, ctx := errgroup.WithContext(ctx) 81 mgr.wg = wg 82 // set master impl to JobManagerImpl 83 mockMaster.Impl = mgr 84 err := mockMaster.Init(ctx) 85 require.Nil(t, err) 86 req := &pb.CreateJobRequest{ 87 Job: &pb.Job{ 88 Type: pb.Job_CVSDemo, 89 Config: []byte("{\"srcHost\":\"0.0.0.0:1234\", \"dstHost\":\"0.0.0.0:1234\", \"srcDir\":\"data\", \"dstDir\":\"data1\"}"), 90 }, 91 } 92 job, err := mgr.CreateJob(ctx, req) 93 require.NoError(t, err) 94 95 require.Eventually(t, func() bool { 96 return mgr.JobFsm.QueryJob(job.Id) != nil 97 }, time.Second*2, time.Millisecond*20) 98 99 // Create a new job with the same id. 100 req = &pb.CreateJobRequest{ 101 Job: &pb.Job{ 102 Id: job.Id, 103 Type: pb.Job_CVSDemo, 104 Config: []byte("{\"srcHost\":\"0.0.0.0:1234\", \"dstHost\":\"0.0.0.0:1234\", \"srcDir\":\"data\", \"dstDir\":\"data1\"}"), 105 }, 106 } 107 _, err = mgr.CreateJob(ctx, req) 108 require.True(t, errors.Is(err, errors.ErrJobAlreadyExists)) 109 110 // delete a finished job, re-create job with the same id will meet error 111 err = mockMaster.GetFrameMetaClient().UpdateJob(ctx, job.Id, 112 map[string]interface{}{ 113 "state": frameModel.MasterStateFinished, 114 }, 115 ) 116 require.NoError(t, err) 117 _, err = mgr.DeleteJob(ctx, &pb.DeleteJobRequest{Id: job.Id}) 118 require.NoError(t, err) 119 _, err = mgr.CreateJob(ctx, req) 120 require.True(t, errors.Is(err, errors.ErrJobAlreadyExists)) 121 } 122 123 type mockBaseMasterCreateWorkerFailed struct { 124 *framework.MockMasterImpl 125 } 126 127 func (m *mockBaseMasterCreateWorkerFailed) CreateWorker( 128 workerType framework.WorkerType, 129 config framework.WorkerConfig, 130 opts ...framework.CreateWorkerOpt, 131 ) (frameModel.WorkerID, error) { 132 return "", errors.ErrMasterConcurrencyExceeded.FastGenByArgs() 133 } 134 135 func TestCreateWorkerReturnError(t *testing.T) { 136 t.Parallel() 137 138 ctx, cancel := context.WithCancel(context.Background()) 139 defer cancel() 140 141 masterImpl := framework.NewMockMasterImpl(t, "", "create-worker-with-error") 142 framework.MockMasterPrepareMeta(ctx, t, masterImpl) 143 mockMaster := &mockBaseMasterCreateWorkerFailed{ 144 MockMasterImpl: masterImpl, 145 } 146 mgr := &JobManagerImpl{ 147 BaseMaster: mockMaster, 148 JobFsm: NewJobFsm(), 149 uuidGen: uuid.NewGenerator(), 150 frameMetaClient: mockMaster.GetFrameMetaClient(), 151 } 152 mockMaster.Impl = mgr 153 err := mockMaster.Init(ctx) 154 require.Nil(t, err) 155 req := &pb.CreateJobRequest{ 156 Job: &pb.Job{ 157 Type: pb.Job_CVSDemo, 158 Config: []byte("{\"srcHost\":\"0.0.0.0:1234\", \"dstHost\":\"0.0.0.0:1234\", \"srcDir\":\"data\", \"dstDir\":\"data1\"}"), 159 }, 160 } 161 _, err = mgr.CreateJob(ctx, req) 162 require.Error(t, err) 163 require.Contains(t, err.Error(), "ErrMasterConcurrencyExceeded") 164 } 165 166 func TestJobManagerCancelJob(t *testing.T) { 167 t.Parallel() 168 169 ctx, cancel := context.WithCancel(context.Background()) 170 defer cancel() 171 172 masterID := "cancel-job-test" 173 mockMaster, mgr := prepareMockJobManager(ctx, t, masterID) 174 mockMaster.On("InitImpl", mock.Anything).Return(nil) 175 mgr.jobOperator = jobop.NewJobOperatorImpl(mgr.frameMetaClient, mgr) 176 177 cancelWorkerID := "cancel-worker-id" 178 meta := &frameModel.MasterMeta{ 179 ID: cancelWorkerID, 180 Type: frameModel.CvsJobMaster, 181 State: frameModel.MasterStateInit, 182 } 183 mgr.JobFsm.JobDispatched(meta, false) 184 185 err := mgr.frameMetaClient.UpsertJob(ctx, meta) 186 require.NoError(t, err) 187 mockWorkerHandle := &framework.MockHandle{WorkerID: cancelWorkerID, ExecutorID: "executor-1"} 188 err = mgr.JobFsm.JobOnline(mockWorkerHandle) 189 require.NoError(t, err) 190 191 req := &pb.CancelJobRequest{ 192 Id: cancelWorkerID, 193 } 194 job, err := mgr.CancelJob(ctx, req) 195 require.NoError(t, err) 196 require.Equal(t, pb.Job_Canceling, job.State) 197 198 for i := 0; i < 5; i++ { 199 err = mgr.jobOperator.Tick(ctx) 200 require.NoError(t, err) 201 require.Equal(t, i+1, mockWorkerHandle.SendMessageCount()) 202 } 203 204 req.Id = cancelWorkerID + "-unknown" 205 _, err = mgr.CancelJob(ctx, req) 206 require.Error(t, err) 207 require.True(t, errors.Is(err, errors.ErrJobNotFound)) 208 } 209 210 func TestJobManagerDeleteJob(t *testing.T) { 211 t.Parallel() 212 213 ctx, cancel := context.WithCancel(context.Background()) 214 defer cancel() 215 216 masterID := "delete-job-test" 217 mockMaster, mgr := prepareMockJobManager(ctx, t, masterID) 218 mockMaster.On("InitImpl", mock.Anything).Return(nil) 219 220 err := mgr.frameMetaClient.UpsertJob(ctx, &frameModel.MasterMeta{ 221 ID: "job-to-be-deleted", 222 Type: frameModel.FakeJobMaster, 223 State: frameModel.MasterStateStopped, 224 }) 225 require.NoError(t, err) 226 227 err = mgr.OnMasterRecovered(ctx) 228 require.NoError(t, err) 229 230 _, err = mgr.DeleteJob(ctx, &pb.DeleteJobRequest{ 231 Id: "job-to-be-deleted", 232 }) 233 require.NoError(t, err) 234 _, err = mgr.frameMetaClient.GetJobByID(ctx, "job-to-be-deleted") 235 require.True(t, pkgOrm.IsNotFoundError(err)) 236 } 237 238 func TestJobManagerGetJob(t *testing.T) { 239 t.Parallel() 240 241 ctx, cancel := context.WithCancel(context.Background()) 242 defer cancel() 243 244 testCases := []struct { 245 meta *frameModel.MasterMeta 246 expectedPBStatus pb.Job_State 247 }{ 248 { 249 &frameModel.MasterMeta{ 250 ID: "master-1", 251 Type: frameModel.FakeJobMaster, 252 State: frameModel.MasterStateUninit, 253 }, 254 pb.Job_Created, 255 }, 256 { 257 &frameModel.MasterMeta{ 258 ID: "master-2", 259 Type: frameModel.FakeJobMaster, 260 State: frameModel.MasterStateInit, 261 }, 262 pb.Job_Running, 263 }, 264 { 265 &frameModel.MasterMeta{ 266 ID: "master-3", 267 Type: frameModel.FakeJobMaster, 268 State: frameModel.MasterStateFinished, 269 }, 270 pb.Job_Finished, 271 }, 272 { 273 &frameModel.MasterMeta{ 274 ID: "master-4", 275 Type: frameModel.FakeJobMaster, 276 State: frameModel.MasterStateStopped, 277 }, 278 pb.Job_Canceled, 279 }, 280 } 281 282 mockMaster := framework.NewMockMasterImpl(t, "", "job-manager-get-job-test") 283 framework.MockMasterPrepareMeta(ctx, t, mockMaster) 284 for _, tc := range testCases { 285 cli := metadata.NewMasterMetadataClient(tc.meta.ID, mockMaster.GetFrameMetaClient()) 286 err := cli.Store(ctx, tc.meta) 287 require.Nil(t, err) 288 } 289 290 mgr := &JobManagerImpl{ 291 BaseMaster: mockMaster.DefaultBaseMaster, 292 JobFsm: NewJobFsm(), 293 uuidGen: uuid.NewGenerator(), 294 masterMetaClient: metadata.NewMasterMetadataClient(metadata.JobManagerUUID, mockMaster.GetFrameMetaClient()), 295 frameMetaClient: mockMaster.GetFrameMetaClient(), 296 jobHTTPClient: jobMock.NewMockNilReturnJobHTTPClient(), 297 } 298 299 statuses, err := mgr.GetJobStatuses(ctx) 300 require.NoError(t, err) 301 require.Len(t, statuses, len(testCases)+1) 302 303 for _, tc := range testCases { 304 req := &pb.GetJobRequest{ 305 Id: tc.meta.ID, 306 } 307 job, err := mgr.GetJob(ctx, req) 308 require.NoError(t, err) 309 require.Equal(t, tc.expectedPBStatus, job.GetState()) 310 311 require.Contains(t, statuses, tc.meta.ID) 312 require.Equal(t, tc.meta.State, statuses[tc.meta.ID]) 313 } 314 } 315 316 func TestJobManagerOnlineJob(t *testing.T) { 317 t.Parallel() 318 319 ctx, cancel := context.WithCancel(context.Background()) 320 defer cancel() 321 322 mockMaster := framework.NewMockMasterImpl(t, "", "submit-job-test") 323 framework.MockMasterPrepareMeta(ctx, t, mockMaster) 324 mockMaster.On("InitImpl", mock.Anything).Return(nil) 325 mockMaster.MasterClient().EXPECT().ScheduleTask(gomock.Any(), gomock.Any()). 326 Return(&pb.ScheduleTaskResponse{}, errors.ErrClusterResourceNotEnough.FastGenByArgs()).MinTimes(0) 327 mgr := &JobManagerImpl{ 328 BaseMaster: mockMaster.DefaultBaseMaster, 329 JobFsm: NewJobFsm(), 330 uuidGen: uuid.NewGenerator(), 331 frameMetaClient: mockMaster.GetFrameMetaClient(), 332 jobStatusChangeMu: ctxmu.New(), 333 } 334 // set master impl to JobManagerImpl 335 mockMaster.Impl = mgr 336 err := mockMaster.Init(ctx) 337 require.Nil(t, err) 338 req := &pb.CreateJobRequest{ 339 Job: &pb.Job{ 340 Type: pb.Job_CVSDemo, 341 Config: []byte("{\"srcHost\":\"0.0.0.0:1234\", \"dstHost\":\"0.0.0.0:1234\", \"srcDir\":\"data\", \"dstDir\":\"data1\"}"), 342 }, 343 } 344 job, err := mgr.CreateJob(ctx, req) 345 require.NoError(t, err) 346 347 err = mgr.JobFsm.JobOnline(&framework.MockHandle{ 348 WorkerID: job.Id, 349 ExecutorID: "executor-1", 350 }) 351 require.NoError(t, err) 352 require.Len(t, mgr.JobFsm.waitAckJobs, 0) 353 require.Len(t, mgr.JobFsm.onlineJobs, 1) 354 } 355 356 func TestJobManagerRecover(t *testing.T) { 357 t.Parallel() 358 359 ctx, cancel := context.WithCancel(context.Background()) 360 defer cancel() 361 362 mockMaster := framework.NewMockMasterImpl(t, "", "job-manager-recover-test") 363 framework.MockMasterPrepareMeta(ctx, t, mockMaster) 364 // prepare mockvk with two job masters 365 meta := []*frameModel.MasterMeta{ 366 { 367 ID: "master-1", 368 Type: frameModel.FakeJobMaster, 369 }, 370 { 371 ID: "master-2", 372 Type: frameModel.FakeJobMaster, 373 }, 374 } 375 for _, data := range meta { 376 cli := metadata.NewMasterMetadataClient(data.ID, mockMaster.GetFrameMetaClient()) 377 err := cli.Store(ctx, data) 378 require.Nil(t, err) 379 } 380 381 mgr := &JobManagerImpl{ 382 BaseMaster: mockMaster.DefaultBaseMaster, 383 JobFsm: NewJobFsm(), 384 uuidGen: uuid.NewGenerator(), 385 masterMetaClient: metadata.NewMasterMetadataClient(metadata.JobManagerUUID, mockMaster.GetFrameMetaClient()), 386 frameMetaClient: mockMaster.GetFrameMetaClient(), 387 jobHTTPClient: jobMock.NewMockNilReturnJobHTTPClient(), 388 } 389 err := mgr.OnMasterRecovered(ctx) 390 require.NoError(t, err) 391 require.Len(t, mgr.JobFsm.waitAckJobs, 3) 392 } 393 394 func TestJobManagerTickExceedQuota(t *testing.T) { 395 t.Parallel() 396 397 ctx, cancel := context.WithCancel(context.Background()) 398 defer cancel() 399 400 masterImpl := framework.NewMockMasterImpl(t, "", "create-worker-with-error") 401 framework.MockMasterPrepareMeta(ctx, t, masterImpl) 402 mockMaster := &mockBaseMasterCreateWorkerFailed{ 403 MockMasterImpl: masterImpl, 404 } 405 mgr := &JobManagerImpl{ 406 BaseMaster: mockMaster, 407 JobFsm: NewJobFsm(), 408 uuidGen: uuid.NewGenerator(), 409 frameMetaClient: mockMaster.GetFrameMetaClient(), 410 jobHTTPClient: jobMock.NewMockNilReturnJobHTTPClient(), 411 } 412 mockMaster.Impl = mgr 413 err := mockMaster.Init(ctx) 414 require.NoError(t, err) 415 416 mgr.JobFsm.JobDispatched(&frameModel.MasterMeta{ID: "failover-job-master"}, true) 417 // try to recreate failover job master, will meet quota error 418 err = mgr.Tick(ctx) 419 require.NoError(t, err) 420 require.Len(t, mgr.JobFsm.waitAckJobs, 1) 421 422 // try to recreate failover job master again, will meet quota error again 423 err = mgr.Tick(ctx) 424 require.NoError(t, err) 425 require.Len(t, mgr.JobFsm.waitAckJobs, 1) 426 } 427 428 func TestJobManagerWatchJobStatuses(t *testing.T) { 429 t.Parallel() 430 431 ctx, cancel := context.WithCancel(context.Background()) 432 defer cancel() 433 434 masterID := "delete-job-test" 435 mockMaster, mgr := prepareMockJobManager(ctx, t, masterID) 436 mockMaster.On("InitImpl", mock.Anything).Return(nil) 437 438 err := mgr.frameMetaClient.UpsertJob(ctx, &frameModel.MasterMeta{ 439 ID: "job-to-be-deleted", 440 Type: frameModel.FakeJobMaster, 441 State: frameModel.MasterStateStopped, 442 }) 443 require.NoError(t, err) 444 445 err = mgr.OnMasterRecovered(ctx) 446 require.NoError(t, err) 447 448 snap, stream, err := mgr.WatchJobStatuses(ctx) 449 require.NoError(t, err) 450 require.Equal(t, map[frameModel.MasterID]frameModel.MasterState{ 451 "delete-job-test": frameModel.MasterStateUninit, 452 "job-to-be-deleted": frameModel.MasterStateStopped, 453 }, snap) 454 455 _, err = mgr.DeleteJob(ctx, &pb.DeleteJobRequest{ 456 Id: "job-to-be-deleted", 457 }) 458 require.NoError(t, err) 459 460 event := <-stream.C 461 require.Equal(t, resManager.JobStatusChangeEvent{ 462 EventType: resManager.JobRemovedEvent, 463 JobID: "job-to-be-deleted", 464 }, event) 465 } 466 467 func TestGetJobDetailFromJobMaster(t *testing.T) { 468 t.Parallel() 469 470 ctx := context.TODO() 471 masterID := "get-job-detail" 472 mockMaster, mgr := prepareMockJobManager(ctx, t, masterID) 473 mockMaster.On("InitImpl", mock.Anything).Return(nil) 474 475 mockCtrl := gomock.NewController(t) 476 defer mockCtrl.Finish() 477 mockJobClient := jobMock.NewMockJobHTTPClient(mockCtrl) 478 mgr.jobHTTPClient = mockJobClient 479 480 masterMeta := &frameModel.MasterMeta{ 481 ID: "new-job", 482 Type: frameModel.FakeJobMaster, 483 // set state to running 484 State: frameModel.MasterStateInit, 485 Addr: "127.0.0.1:10340", 486 ErrorMsg: "error_message", 487 } 488 489 // normal case, return job detail 490 err := mgr.frameMetaClient.UpsertJob(ctx, masterMeta) 491 require.NoError(t, err) 492 493 mgr.JobFsm.JobDispatched(masterMeta, false) 494 err = mgr.JobFsm.JobOnline(&framework.MockHandle{ 495 WorkerID: "new-job", 496 ExecutorID: "executor-1", 497 }) 498 require.NoError(t, err) 499 500 mockJobClient.EXPECT().GetJobDetail(ctx, "127.0.0.1:10340", "new-job").Return([]byte("detail test"), nil).Times(1) 501 job, err := mgr.GetJob(ctx, &pb.GetJobRequest{Id: "new-job"}) 502 require.NoError(t, err) 503 require.True(t, proto.Equal(&pb.Job{ 504 Id: "new-job", 505 Type: pb.Job_FakeJob, 506 State: pb.Job_Running, 507 Detail: []byte("detail test"), 508 Error: &pb.Job_Error{ 509 Message: "error_message", 510 }, 511 }, job)) 512 513 // get job detail failed 514 err = mgr.frameMetaClient.UpsertJob(ctx, &frameModel.MasterMeta{ 515 ID: "new-job", 516 Type: frameModel.FakeJobMaster, 517 // set status code to running state 518 State: frameModel.MasterStateInit, 519 Addr: "127.0.0.1:10340", 520 ErrorMsg: "error_message", 521 }) 522 require.NoError(t, err) 523 524 mockJobClient.EXPECT(). 525 GetJobDetail(ctx, "127.0.0.1:10340", "new-job"). 526 Return(nil, &openapi.HTTPError{ 527 Code: string(errors.ErrJobNotRunning.RFCCode()), 528 Message: "job new-job is not running", 529 }). 530 Times(1) 531 job, err = mgr.GetJob(ctx, &pb.GetJobRequest{Id: "new-job"}) 532 require.NoError(t, err) 533 require.True(t, proto.Equal(&pb.Job{ 534 Id: "new-job", 535 Type: pb.Job_FakeJob, 536 State: pb.Job_Running, 537 Error: &pb.Job_Error{ 538 Code: "DFLOW:ErrJobNotRunning", 539 Message: "job new-job is not running", 540 }, 541 }, job)) 542 } 543 544 func TestListJobsPagination(t *testing.T) { 545 t.Parallel() 546 547 ctx, cancel := context.WithCancel(context.Background()) 548 defer cancel() 549 550 mockMaster := framework.NewMockMasterImpl(t, "", "job-manager-list-jobs-test") 551 masterMeta := mockMaster.DefaultBaseMaster.MasterMeta() 552 masterMeta.Type = frameModel.JobManager 553 err := mockMaster.GetFrameMetaClient().UpsertJob(ctx, masterMeta) 554 require.NoError(t, err) 555 556 const totalJobCount = 2000 557 558 jobIDs := make([]string, 0, totalJobCount) 559 for i := 0; i < totalJobCount; i++ { 560 jobID := fmt.Sprintf("job-%04d", i) 561 jobIDs = append(jobIDs, jobID) 562 cli := metadata.NewMasterMetadataClient(jobID, mockMaster.GetFrameMetaClient()) 563 require.NoError(t, cli.Store(ctx, &frameModel.MasterMeta{ 564 ID: jobID, 565 Type: frameModel.FakeJobMaster, 566 State: frameModel.MasterStateStopped, 567 })) 568 } 569 570 mgr := &JobManagerImpl{ 571 BaseMaster: mockMaster.DefaultBaseMaster, 572 JobFsm: NewJobFsm(), 573 uuidGen: uuid.NewGenerator(), 574 masterMetaClient: metadata.NewMasterMetadataClient(metadata.JobManagerUUID, mockMaster.GetFrameMetaClient()), 575 frameMetaClient: mockMaster.GetFrameMetaClient(), 576 jobHTTPClient: jobMock.NewMockNilReturnJobHTTPClient(), 577 } 578 579 // List jobs without specifying page size. 580 resp, err := mgr.ListJobs(ctx, &pb.ListJobsRequest{}) 581 require.NoError(t, err) 582 require.Len(t, resp.Jobs, defaultListPageSize) 583 for i := 0; i < defaultListPageSize; i++ { 584 require.Equal(t, jobIDs[i], resp.Jobs[i].Id) 585 } 586 require.Equal(t, jobIDs[defaultListPageSize-1], resp.NextPageToken) 587 588 // List jobs with huge page size. 589 resp, err = mgr.ListJobs(ctx, &pb.ListJobsRequest{PageSize: 10000}) 590 require.NoError(t, err) 591 require.Len(t, resp.Jobs, maxListPageSize) 592 593 // List all jobs with pagination. 594 var ( 595 respJobIDs []string 596 nextPageToken string 597 ) 598 pageSize := 123 599 for { 600 resp, err = mgr.ListJobs(ctx, &pb.ListJobsRequest{PageSize: int32(pageSize), PageToken: nextPageToken}) 601 require.NoError(t, err) 602 for _, job := range resp.Jobs { 603 respJobIDs = append(respJobIDs, job.Id) 604 } 605 if resp.NextPageToken == "" { 606 break 607 } 608 nextPageToken = resp.NextPageToken 609 } 610 require.Equal(t, jobIDs, respJobIDs) 611 } 612 613 func TestListJobWithFilter(t *testing.T) { 614 t.Parallel() 615 616 ctx, cancel := context.WithCancel(context.Background()) 617 defer cancel() 618 619 mockMaster := framework.NewMockMasterImpl(t, "", "job-manager-list-jobs-test") 620 masterMeta := mockMaster.DefaultBaseMaster.MasterMeta() 621 masterMeta.Type = frameModel.JobManager 622 err := mockMaster.GetFrameMetaClient().UpsertJob(ctx, masterMeta) 623 require.NoError(t, err) 624 625 allTypes := []frameModel.WorkerType{ 626 frameModel.CvsJobMaster, frameModel.FakeJobMaster, 627 frameModel.DMJobMaster, frameModel.CdcJobMaster, 628 } 629 allStates := []frameModel.MasterState{ 630 frameModel.MasterStateUninit, frameModel.MasterStateInit, 631 frameModel.MasterStateFinished, frameModel.MasterStateStopped, frameModel.MasterStateFailed, 632 } 633 rnd := rand.New(rand.NewSource(0)) 634 randType := func() frameModel.WorkerType { 635 return allTypes[rnd.Intn(len(allTypes))] 636 } 637 randState := func() frameModel.MasterState { 638 return allStates[rnd.Intn(len(allStates))] 639 } 640 641 const totalJobCount = maxListPageSize 642 countByType := make(map[frameModel.WorkerType]int) 643 countByState := make(map[frameModel.MasterState]int) 644 for i := 0; i < totalJobCount; i++ { 645 jobID := fmt.Sprintf("job-%04d", i) 646 cli := metadata.NewMasterMetadataClient("job-1", mockMaster.GetFrameMetaClient()) 647 masterMeta := &frameModel.MasterMeta{ 648 ID: jobID, 649 Type: randType(), 650 State: randState(), 651 } 652 require.NoError(t, cli.Store(ctx, masterMeta)) 653 countByType[masterMeta.Type]++ 654 countByState[masterMeta.State]++ 655 } 656 657 mgr := &JobManagerImpl{ 658 BaseMaster: mockMaster.DefaultBaseMaster, 659 JobFsm: NewJobFsm(), 660 uuidGen: uuid.NewGenerator(), 661 masterMetaClient: metadata.NewMasterMetadataClient(metadata.JobManagerUUID, mockMaster.GetFrameMetaClient()), 662 frameMetaClient: mockMaster.GetFrameMetaClient(), 663 jobHTTPClient: jobMock.NewMockNilReturnJobHTTPClient(), 664 } 665 666 // List jobs with filter. 667 // TODO: we should test all combinations of filters, but there's no convenient way 668 // to mapping worker type to job type and master state to job state. 669 resp, err := mgr.ListJobs(ctx, &pb.ListJobsRequest{ 670 PageSize: totalJobCount, 671 Type: pb.Job_FakeJob, 672 }) 673 require.NoError(t, err) 674 require.Len(t, resp.Jobs, countByType[frameModel.FakeJobMaster]) 675 676 resp, err = mgr.ListJobs(ctx, &pb.ListJobsRequest{ 677 PageSize: totalJobCount, 678 State: pb.Job_Running, 679 }) 680 require.NoError(t, err) 681 require.Len(t, resp.Jobs, countByState[frameModel.MasterStateInit]) 682 } 683 684 func TestOnWorkerDispatchedFastFail(t *testing.T) { 685 t.Parallel() 686 687 ctx, cancel := context.WithCancel(context.Background()) 688 defer cancel() 689 690 masterID := "job-fast-fail-test" 691 mockMaster, mgr := prepareMockJobManager(ctx, t, masterID) 692 mockMaster.On("InitImpl", mock.Anything).Return(nil) 693 694 // simulate a job is created. 695 mgr.JobFsm.JobDispatched(mockMaster.MasterMeta(), false) 696 errorMsg := "unit test fast fail error" 697 mockHandle := &framework.MockHandle{WorkerID: masterID} 698 nerr := errors.ErrCreateWorkerTerminate.GenWithStack(errorMsg) 699 // OnWorkerDispatched callback on job manager, a terminated error will make 700 // job fast fail. 701 err := mgr.OnWorkerDispatched(mockHandle, nerr) 702 require.NoError(t, err) 703 meta, err := mgr.frameMetaClient.QueryJobsByState(ctx, 704 mockMaster.MasterMeta().ProjectID, int(frameModel.MasterStateFailed)) 705 require.NoError(t, err) 706 require.Len(t, meta, 1) 707 require.Equal(t, nerr.Error(), meta[0].ErrorMsg) 708 } 709 710 func TestJobOperatorBgLoop(t *testing.T) { 711 t.Parallel() 712 713 ctx, cancel := context.WithCancel(context.Background()) 714 defer cancel() 715 716 masterID := "job-operator-bg-loop-test" 717 mockMaster, mgr := prepareMockJobManager(ctx, t, masterID) 718 mockMaster.On("InitImpl", mock.Anything).Return(nil) 719 720 mockJobOperator := jobopMock.NewMockJobOperator(gomock.NewController(t)) 721 mgr.jobOperator = mockJobOperator 722 723 wg, ctx := errgroup.WithContext(ctx) 724 mgr.wg = wg 725 mgr.bgJobOperatorLoop(ctx) 726 727 tickCounter := atomic.NewInt32(0) 728 mockJobOperator.EXPECT(). 729 Tick(gomock.Any()).AnyTimes(). 730 DoAndReturn(func(ctx context.Context) error { 731 tickCounter.Add(1) 732 return nil 733 }) 734 wg.Go(func() error { 735 for i := 0; i < 6; i++ { 736 mgr.jobOperatorNotifier.Notify() 737 time.Sleep(time.Millisecond * 50) 738 } 739 return nil 740 }) 741 require.Eventually(t, func() bool { 742 return tickCounter.Load() > 0 743 }, time.Second, time.Millisecond*100) 744 745 mgr.CloseImpl(ctx) 746 require.NoError(t, mgr.wg.Wait()) 747 } 748 749 // TODO: refine the interface of JobManager and use mock JobManager in test 750 func dispatchJobAndMeetError( 751 ctx context.Context, t *testing.T, mgr *JobManagerImpl, meta *frameModel.MasterMeta, 752 ) { 753 err := mgr.frameMetaClient.UpsertJob(ctx, meta) 754 require.NoError(t, err) 755 756 // dispatch job, meet error and move it to pending job list 757 mgr.JobFsm.JobDispatched(&frameModel.MasterMeta{ID: meta.ID}, false) 758 require.NotNil(t, mgr.QueryJob(meta.ID)) 759 mockHandle := &framework.MockHandle{WorkerID: meta.ID} 760 mgr.JobFsm.JobOffline(mockHandle, true /* needFailover */) 761 } 762 763 func TestJobManagerIterPendingJobs(t *testing.T) { 764 t.Parallel() 765 766 ctx, cancel := context.WithCancel(context.Background()) 767 defer cancel() 768 769 masterImpl := framework.NewMockMasterImpl(t, "", "iter-pending-jobs-test") 770 framework.MockMasterPrepareMeta(ctx, t, masterImpl) 771 mockMaster := &mockBaseMasterCreateWorkerFailed{ 772 MockMasterImpl: masterImpl, 773 } 774 ctrl := gomock.NewController(t) 775 mockBackoffMgr := jobopMock.NewMockBackoffManager(ctrl) 776 mockJobOperator := jobopMock.NewMockJobOperator(ctrl) 777 mgr := &JobManagerImpl{ 778 BaseMaster: mockMaster, 779 JobFsm: NewJobFsm(), 780 uuidGen: uuid.NewGenerator(), 781 frameMetaClient: mockMaster.GetFrameMetaClient(), 782 jobHTTPClient: jobMock.NewMockNilReturnJobHTTPClient(), 783 JobBackoffMgr: mockBackoffMgr, 784 jobOperator: mockJobOperator, 785 } 786 mockMaster.Impl = mgr 787 err := mockMaster.Init(ctx) 788 require.NoError(t, err) 789 790 newMasterMeta := func(jobID string) *frameModel.MasterMeta { 791 return &frameModel.MasterMeta{ 792 ID: jobID, 793 State: frameModel.MasterStateInit, 794 } 795 } 796 797 jobMgrTickAndCheckJobState := func(jobID string, state frameModel.MasterState) { 798 err := mgr.Tick(ctx) 799 require.NoError(t, err) 800 meta, err := mgr.frameMetaClient.GetJobByID(ctx, jobID) 801 require.NoError(t, err) 802 require.Equal(t, state, meta.State) 803 } 804 805 { 806 jobID := "job-backoff-test-1" 807 dispatchJobAndMeetError(ctx, t, mgr, newMasterMeta(jobID)) 808 809 // job is being backoff 810 mockJobOperator.EXPECT().IsJobCanceling(ctx, jobID).Times(1).Return(false) 811 mockBackoffMgr.EXPECT().Terminate(jobID).Times(1).Return(false) 812 mockBackoffMgr.EXPECT().Allow(jobID).Times(1).Return(false) 813 err = mgr.Tick(ctx) 814 require.NoError(t, err) 815 816 // job will be terminated because it exceeds max try time 817 mockJobOperator.EXPECT().IsJobCanceling(ctx, jobID).Times(1).Return(false) 818 mockBackoffMgr.EXPECT().Terminate(jobID).Times(1).Return(true) 819 jobMgrTickAndCheckJobState(jobID, frameModel.MasterStateFailed) 820 } 821 822 { 823 jobID := "job-backoff-test-2" 824 dispatchJobAndMeetError(ctx, t, mgr, newMasterMeta(jobID)) 825 826 // job will be terminated because it is canceled 827 mockJobOperator.EXPECT().IsJobCanceling(ctx, jobID).Times(1).Return(true) 828 jobMgrTickAndCheckJobState(jobID, frameModel.MasterStateStopped) 829 } 830 } 831 832 func TestFailoverWithCreateWorkerOpt(t *testing.T) { 833 t.Parallel() 834 835 ctx, cancel := context.WithCancel(context.Background()) 836 defer cancel() 837 838 selectors := []*label.Selector{ 839 {Key: "name", Target: "executor.*", Op: label.OpRegex}, 840 {Key: "region", Target: "us-west-2", Op: label.OpEq}, 841 } 842 checkOptsFn := func(opts ...framework.CreateWorkerOpt) { 843 // CreateWorkerOpt: 1 for label selectors 844 require.Len(t, opts, 1) 845 } 846 847 masterImpl := framework.NewMockMasterImpl(t, "", "iter-pending-jobs-test") 848 framework.MockMasterPrepareMeta(ctx, t, masterImpl) 849 mockMaster := &mockBaseMasterCheckCreateOpts{ 850 MockMasterImpl: masterImpl, 851 checkOptsFn: checkOptsFn, 852 } 853 ctrl := gomock.NewController(t) 854 mockBackoffMgr := jobopMock.NewMockBackoffManager(ctrl) 855 mockJobOperator := jobopMock.NewMockJobOperator(ctrl) 856 mgr := &JobManagerImpl{ 857 BaseMaster: mockMaster, 858 JobFsm: NewJobFsm(), 859 uuidGen: uuid.NewGenerator(), 860 frameMetaClient: mockMaster.GetFrameMetaClient(), 861 jobHTTPClient: jobMock.NewMockNilReturnJobHTTPClient(), 862 JobBackoffMgr: mockBackoffMgr, 863 jobOperator: mockJobOperator, 864 } 865 mockMaster.Impl = mgr 866 err := mockMaster.Init(ctx) 867 require.NoError(t, err) 868 869 { 870 job := &frameModel.MasterMeta{ 871 ID: "failover-job-with-label", 872 State: frameModel.MasterStateInit, 873 Ext: frameModel.MasterMetaExt{Selectors: selectors}, 874 } 875 dispatchJobAndMeetError(ctx, t, mgr, job) 876 877 mockJobOperator.EXPECT().IsJobCanceling(ctx, job.ID).Times(1).Return(false) 878 mockBackoffMgr.EXPECT().Terminate(job.ID).Times(1).Return(false) 879 mockBackoffMgr.EXPECT().Allow(job.ID).Times(1).Return(true) 880 err := mgr.Tick(ctx) 881 require.NoError(t, err) 882 } 883 } 884 885 type mockBaseMasterCheckCreateOpts struct { 886 *framework.MockMasterImpl 887 checkOptsFn func(opts ...framework.CreateWorkerOpt) 888 } 889 890 func (m *mockBaseMasterCheckCreateOpts) CreateWorker( 891 workerType framework.WorkerType, 892 config framework.WorkerConfig, 893 opts ...framework.CreateWorkerOpt, 894 ) (frameModel.WorkerID, error) { 895 m.checkOptsFn(opts...) 896 return uuid.NewGenerator().NewString(), nil 897 } 898 899 func TestIsJobTerminated(t *testing.T) { 900 require.False(t, isJobTerminated(frameModel.MasterStateUninit)) 901 require.False(t, isJobTerminated(frameModel.MasterStateInit)) 902 require.True(t, isJobTerminated(frameModel.MasterStateFinished)) 903 require.True(t, isJobTerminated(frameModel.MasterStateFailed)) 904 require.True(t, isJobTerminated(frameModel.MasterStateStopped)) 905 } 906 907 func TestBuildPBJob(t *testing.T) { 908 t.Parallel() 909 910 testCases := []struct { 911 masterMeta *frameModel.MasterMeta 912 includeConfig bool 913 job *pb.Job 914 }{ 915 { 916 masterMeta: &frameModel.MasterMeta{ 917 ID: "job-1", 918 Type: frameModel.CvsJobMaster, 919 State: frameModel.MasterStateUninit, 920 Config: []byte("job-1-config"), 921 Detail: []byte("job-1-detail"), 922 }, 923 includeConfig: true, 924 job: &pb.Job{ 925 Id: "job-1", 926 Type: pb.Job_CVSDemo, 927 State: pb.Job_Created, 928 Error: &pb.Job_Error{}, 929 Config: []byte("job-1-config"), 930 Detail: []byte("job-1-detail"), 931 }, 932 }, 933 { 934 masterMeta: &frameModel.MasterMeta{ 935 ID: "job-2", 936 Type: frameModel.DMJobMaster, 937 State: frameModel.MasterStateInit, 938 Config: []byte("job-2-config"), 939 Detail: []byte("job-2-detail"), 940 }, 941 includeConfig: true, 942 job: &pb.Job{ 943 Id: "job-2", 944 Type: pb.Job_DM, 945 State: pb.Job_Running, 946 Error: &pb.Job_Error{}, 947 Config: []byte("job-2-config"), 948 Detail: []byte("job-2-detail"), 949 }, 950 }, 951 { 952 masterMeta: &frameModel.MasterMeta{ 953 ID: "job-3", 954 Type: frameModel.CdcJobMaster, 955 State: frameModel.MasterStateStopped, 956 Config: []byte("job-3-config"), 957 Detail: []byte("job-3-detail"), 958 }, 959 includeConfig: true, 960 job: &pb.Job{ 961 Id: "job-3", 962 Type: pb.Job_CDC, 963 State: pb.Job_Canceled, 964 Error: &pb.Job_Error{}, 965 Config: []byte("job-3-config"), 966 Detail: []byte("job-3-detail"), 967 }, 968 }, 969 { 970 masterMeta: &frameModel.MasterMeta{ 971 ID: "job-4", 972 Type: frameModel.FakeJobMaster, 973 State: frameModel.MasterStateFinished, 974 Config: []byte("job-4-config"), 975 Detail: []byte("job-4-detail"), 976 }, 977 job: &pb.Job{ 978 Id: "job-4", 979 Type: pb.Job_FakeJob, 980 State: pb.Job_Finished, 981 Error: &pb.Job_Error{}, 982 Detail: []byte("job-4-detail"), 983 }, 984 }, 985 { 986 masterMeta: &frameModel.MasterMeta{ 987 ID: "job-5", 988 Type: frameModel.FakeJobMaster, 989 State: frameModel.MasterStateFailed, 990 Config: []byte("job-5-config"), 991 Detail: []byte("job-5-detail"), 992 ErrorMsg: "job-5-error", 993 }, 994 job: &pb.Job{ 995 Id: "job-5", 996 Type: pb.Job_FakeJob, 997 State: pb.Job_Failed, 998 Error: &pb.Job_Error{ 999 Message: "job-5-error", 1000 }, 1001 Detail: []byte("job-5-detail"), 1002 }, 1003 }, 1004 } 1005 1006 for _, tc := range testCases { 1007 job, err := buildPBJob(tc.masterMeta, tc.includeConfig) 1008 require.NoError(t, err) 1009 require.True(t, proto.Equal(tc.job, job)) 1010 } 1011 }